📄 schedule.c
字号:
; } /* * Get the queue limits and utilization for each queue about which the * scheduler knows. Any jobs on schd_AllJobs (set by get_and_sort_jobs() * above) that belong to the queue will be placed on the queue->jobs * list. * * If PBS fails to provide us any information about a queue, treat it * as a fatal error. */ error = get_all_queue_info(3 /* Number of queue lists */, schd_SubmitQueue, schd_BatchQueues, schd_DedQueues); if (error < 0) { DBPRT(("get_all_queue_info() failed\n")); return (1); /* Bogus queue - don't recycle. */ } else if (error > 0) { DBPRT(("queue failed sanity check - wait and recycle.\n")); sleep(WAIT_FOR_QUEUE_SANITY); return (0); /* Attempt to recycle scheduler. */ } /* Fix added by jjones per Dr. Hook to change behavior of jobs being * moved from submit queue to exec queue before run, at request of * ERDC. */ fix_jim(schd_SubmitQueue->queue,schd_BatchQueues->queue); /* * At this point, schd_AllJobs should hold only orphan jobs (i.e. only * jobs that belong to queues about which the scheduler does not care). * Note it and go on scheduling -- unless nothing is being scheduled, * this is more-or-less meaningless. */ if (schd_AllJobs) { (void)sprintf(log_buffer, "Some jobs not claimed by queues."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n%s: Unclaimed jobs: ", id, log_buffer, id));#ifdef DEBUG for (this = schd_AllJobs; this != NULL; this = this->next) { DBPRT(("%s%s", this->jobid, this->next ? ", " : "")); } DBPRT(("\n"));#endif /* DEBUG */ } /* Dump the list of jobs being scheduled from submit queue. */ if (schd_JOB_DUMPFILE) { (void)sprintf(log_buffer, "Dumping sorted job information to %s", schd_JOB_DUMPFILE); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); make_job_dump(schd_JOB_DUMPFILE); } /* * Allocation and usage information are updated at [roughly] 2:00 AM * (Eastern time). Since they may have been updated, attempt to fetch * them again in the middle of the night. */ if (schd_NeedToGetDecayInfo) schd_decay_info("r"); /* get users' recent past usage */ if (schd_ENFORCE_ALLOCATION && schd_TimeNow >= schd_ENFORCE_ALLOCATION) { /* * If the allocations file has already been loaded, consult the file * timestamp to determine if it has changed. If so, flag that it * needs to be reloaded. */ if (!schd_NeedToGetAllocInfo && schd_AllocFilename) schd_NeedToGetAllocInfo = schd_file_has_changed(schd_AllocFilename, 1); if (!schd_NeedToGetYTDInfo && schd_CurrentFilename) schd_NeedToGetYTDInfo = schd_file_has_changed(schd_CurrentFilename, 1); /* If either file needs to be [re]loaded, do so. */ if (schd_NeedToGetAllocInfo || schd_NeedToGetYTDInfo) schd_alloc_info(); } /* * We need to save the past usage data periodically, so that a restart * of pbs_sched doesn't lose it ... */ if (schd_save_decay()) /* is it time yet ? */ schd_decay_info("w"); /* yep, so do it */ if (schd_SubmitQueue->queue->jobs && !(schd_SubmitQueue->queue->flags & (QFLAGS_DISABLED | QFLAGS_STOPPED))) { /* * Test each job against the set of execution queues. If it can * never be run in any queue, reject it immediately. This saves * the user having to wait for the scheduler to get around to being * able to run it. */ jobs = reject_unrunnables(schd_SubmitQueue->queue->jobs); /* * Look for queues whose execution hosts are in dedicated time. If * any are found, note that fact and continue. Otherwise, add them * to the normalQs list, which will be scheduled normally. If the * flag is set indicating that one or more hosts is in dedtime, they * will be scheduled after everything else is done. */ for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next) { if (schd_ENFORCE_DEDTIME && schd_TimeNow >= schd_ENFORCE_DEDTIME) outages = schd_host_outage(qptr->queue->exechost, 0); else outages = NULL; /* * Is there a scheduled outage right now for this host? If so, * note that fact and continue to the next queue. All of this * information is cached, so this isn't as expensive as it seems. */ if (outages != NULL) { if ((outages->beg_time <= schd_TimeNow) && (outages->end_time > schd_TimeNow)) { DBPRT(("%s: Host %s is in dedtime (from %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); DBPRT(("%s: Queue %s@%s will not be scheduled.\n", id, qptr->queue->qname, qptr->queue->exechost)); /* This exechost is in dedicated time, ignore the queue. */ hosts_in_dedtime ++; continue; } else if (outages->beg_time > schd_TimeNow) { /* Upcoming dedtime, but not yet. Schedule the queue. */ DBPRT(("%s: Host %s upcoming dedtime (at %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); } } /* * This host is not currently in dedicated time. Add it to the * tail of the list of queues to be scheduled. */ newqlp = (QueueList *)malloc(sizeof(QueueList)); if (newqlp == NULL) { (void)sprintf(log_buffer, "malloc(QueueList) for %s@%s failed", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); if (normalQs) schd_free_qlist(normalQs); return (1); } newqlp->queue = qptr->queue; if (normalQtail) normalQtail->next = newqlp; else normalQs = newqlp; normalQtail = newqlp; newqlp->next = NULL; } DBPRT(("%s: calling schedule_jobs(", id)); if (normalQs) { for (qptr = normalQs; qptr != NULL; qptr = qptr->next) DBPRT(("%s@%s%s", qptr->queue->qname, qptr->queue->exechost, qptr->next ? ", " : "")); } else { DBPRT(("<no batch queues>")); } DBPRT((")\n")); /* Now make the call to actually run some jobs */ total_ran += ran = schedule_jobs(normalQs, jobs, reason); if (ran < 0) { DBPRT(("Could not run any jobs!\n")); } else { DBPRT(("RAN %d jobs.\n", ran)); } if (normalQs) schd_free_qlist(normalQs); normalQs = normalQtail = NULL; } /* * Now check the dedtime queues with queued jobs for hosts that are * in dedicated time. If any are found, comment the jobs appropriately * and/or schedule them. */ for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next) { if (qptr->queue->queued == 0) continue; DBPRT(("%s: schd_handle_dedicated_time(%s)\n", id, qptr->queue->qname)); /* * Keep track of the next pointer, and zero the queue's next ptr so * it looks like a single queue. */ next = qptr->next; qptr->next = NULL; ran = schd_handle_dedicated_time(qptr->queue); if (ran < 0) { (void)sprintf(log_buffer, "schd_handle_dedicated_time(%s@%s) failed!", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } else { DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname, qptr->queue->exechost)); total_ran += ran; } /* Replace the zero'd next pointer to rechain the list. */ qptr->next = next; } if (total_ran > 0) { (void)sprintf(log_buffer, "System resources after scheduling:"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); schd_dump_rsrclist(); } (void)sprintf(log_buffer, ">>> End Scheduling Cycle (ran %d jobs) <<<", total_ran); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); return (1);}static intschedule_jobs(QueueList *queues, Job *jobs, char *reason){ char *id = "schedule_jobs"; int numran; Job *job; Queue *shortest; int priority_to_1st = 1; /* * Since the sorting code has provided an order in which the jobs should * be run, attempt to honor that order by treating the first job on the * list as our first priority. This amounts to draining the queue in * order to run that job, if necessary. * * If the job has been waiting too long, find the smallest queue in which * the job will fit, and consider its expected run time. If the waiting * job cannot run when the queue has emptied, then go on to the next. * However, if there are jobs running on the queue, it is possible that * this queue could support the waiting job if it were started draining * now. When enough jobs had exited, the waiting job would be runnable. * In order to determine if this is true, walk through the list of jobs, * which are sorted in order of completion (from soonest to last), and * find how many resources would be available after that job finished. * If there is space, calculate what time it will be when that many jobs * have completed, and see if the primetime limits apply at that time. * If the job fits in the primetime limits at that time, then start the * queue draining. If it will not fit after all jobs have been tested, * then give up on this queue and go on to the next. * * If a queue was found that requires draining, mark it for draining. * * After the waiting job handling has completed, collect a list of * all the available execution queues, and place it into the pointer * given to this function by the caller. */ for (job = jobs; job != NULL; job = job->next) { if (job->state != 'Q') continue; if (!priority_to_1st && !(job->flags & JFLAGS_WAITING)) continue; DBPRT(("%s: job %s is %s (eligible for %s, needs %d nodes)\n", id, job->jobid, priority_to_1st ? "FIRSTJOB" : (job->flags & JFLAGS_PRIORITY) ? "SPECIAL" : "WAITING", schd_sec2val(job->eligible), job->nodes)); /* * Find the smallest, shortest-wait queue in which this job will * fit. If it is empty, great. If not, mark it to be drained, * in anticipation of the job being run soon. Note that the queue * drain_by time should only be shortened - it doesn't make sense * to push it out. */ shortest = schd_find_drain(queues, job); if (shortest) { /* * If there are no jobs running in the queue, then unset the * draining flag (if present), so that the queue will be * available for this job. * * If there are running jobs, set the draining flag, and * adjust the empty_by value to be the expected time when * the job will first become runnable. */ if (shortest->running == 0) { shortest->flags &= ~QFLAGS_DRAINING; } else { /* If running jobs, empty_by should be non-zero. */ if (shortest->drain_by <= shortest->empty_by) { shortest->flags |= QFLAGS_DRAINING; DBPRT(("%s: shortest queue %s now draining, drain_by %s", id, shortest->qname, ctime(&shortest->drain_by))); } } } /* * We have looked at (and possibly arranged for special treatment * of) the first job on the list. Now only look for special or * waiting jobs. */ priority_to_1st = 0; } numran = schd_pack_queues(jobs, queues, reason); if (numran < 0) { (void)sprintf(log_buffer, "sched_pack_queues() failed!"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } return (numran);}/* * Get information about each of the queues in the list of lists. If * schd_get_queue_limits() fails, return the error condition. It may * be a transient or a hard failure, which the caller may want to deal * with. If all queues are successful, return '0'. */static intget_all_queue_info(int numqlists, QueueList *list, ...){ va_list ap; int count = 0, ret; QueueList *qptr; char *id = "get_all_queue_info"; va_start(ap, numqlists); while (count < numqlists) { list = va_arg(ap, QueueList *); for (qptr = list; qptr != NULL; qptr = qptr->next) { /* * Get the limits, current resources, and any jobs for this * queue. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -