📄 schedule.c
字号:
* Allocation and usage information are updated at [roughly] 2:00 AM * (Eastern time). Since they may have been updated, attempt to fetch * them again in the middle of the night. */ if (schd_NeedToGetDecayInfo) schd_decay_info("r"); /* get users' recent past usage */ if (schd_ENFORCE_ALLOCATION && schd_TimeNow >= schd_ENFORCE_ALLOCATION) { /* * If the allocations file has already been loaded, consult the file * timestamp to determine if it has changed. If so, flag that it * needs to be reloaded. */ if (!schd_NeedToGetAllocInfo && schd_AllocFilename) schd_NeedToGetAllocInfo = schd_file_has_changed(schd_AllocFilename, 1); if (!schd_NeedToGetYTDInfo && schd_CurrentFilename) schd_NeedToGetYTDInfo = schd_file_has_changed(schd_CurrentFilename, 1); /* If either file needs to be [re]loaded, do so. */ if (schd_NeedToGetAllocInfo || schd_NeedToGetYTDInfo) schd_alloc_info(); } /* * We need to save the past usage data periodically, so that a restart * of pbs_sched doesn't lose it ... */ if (schd_save_decay()) /* is it time yet ? */ schd_decay_info("w"); /* yep, so do it */ if (schd_SubmitQueue->queue->jobs && !(schd_SubmitQueue->queue->flags & (QFLAGS_DISABLED | QFLAGS_STOPPED))) { /* * Test each job against the set of execution queues. If it can * never be run in any queue, reject it immediately. This saves * the user having to wait for the scheduler to get around to being * able to run it. */ jobs = reject_unrunnables(schd_SubmitQueue->queue->jobs); /* * Look for queues whose execution hosts are in dedicated time. If * any are found, note that fact and continue. Otherwise, add them * to the normalQs list, which will be scheduled normally. If the * flag is set indicating that one or more hosts is in dedtime, they * will be scheduled after everything else is done. */ for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next) { if (schd_ENFORCE_DEDTIME && schd_TimeNow >= schd_ENFORCE_DEDTIME) outages = schd_host_outage(qptr->queue->exechost, 0); else outages = NULL; /* * Is there a scheduled outage right now for this host? If so, * note that fact and continue to the next queue. All of this * information is cached, so this isn't as expensive as it seems. */ if (outages != NULL) { if ((outages->beg_time <= schd_TimeNow) && (outages->end_time > schd_TimeNow)) { DBPRT(("%s: Host %s is in dedtime (from %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); DBPRT(("%s: Queue %s@%s will not be scheduled.\n", id, qptr->queue->qname, qptr->queue->exechost)); /* This exechost is in dedicated time, ignore the queue. */ hosts_in_dedtime ++; continue; } else if (outages->beg_time > schd_TimeNow) { /* Upcoming dedtime, but not yet. Schedule the queue. */ DBPRT(("%s: Host %s upcoming dedtime (at %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); } } /* * This host is not currently in dedicated time. Add it to the * tail of the list of queues to be scheduled. */ newqlp = (QueueList *)malloc(sizeof(QueueList)); if (newqlp == NULL) { (void)sprintf(log_buffer, "malloc(QueueList) for %s@%s failed", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); if (normalQs) schd_free_qlist(normalQs); return (1); } newqlp->queue = qptr->queue; if (normalQtail) normalQtail->next = newqlp; else normalQs = newqlp; normalQtail = newqlp; newqlp->next = NULL; } DBPRT(("%s: calling schedule_jobs(", id)); if (normalQs) { for (qptr = normalQs; qptr != NULL; qptr = qptr->next) DBPRT(("%s@%s%s", qptr->queue->qname, qptr->queue->exechost, qptr->next ? ", " : "")); } else { DBPRT(("<no batch queues>")); } DBPRT((")\n")); total_ran += ran = schedule_jobs(normalQs, jobs, reason); if (ran < 0) { DBPRT(("Could not run any jobs!\n")); } else { DBPRT(("RAN %d jobs.\n", ran)); } if (normalQs) schd_free_qlist(normalQs); normalQs = normalQtail = NULL; } /* * If there are any externally-routed queues, schedule any jobs * that are enqueued in them. */ for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next) { if (qptr->queue->queued == 0) continue; (void)sprintf(log_buffer, "Scheduling external queue %s@%s ...", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); /* * Keep track of the next pointer. Zero it so that each queue * looks like a single queue to schd_pack_queues(). */ next = qptr->next; qptr->next = NULL; ran = schd_pack_queues(qptr->queue->jobs, qptr, reason); if (ran < 0) { (void)sprintf(log_buffer, "sched_pack_queues(%s@%s) failed!", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } else { DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname, qptr->queue->exechost)); total_ran += ran; } /* Replace the zero'd next pointer to rechain the list. */ qptr->next = next; } /* * Now check the dedtime queues with queued jobs for hosts that are * in dedicated time. If any are found, comment the jobs appropriately * and/or schedule them. */ for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next) { if (qptr->queue->queued == 0) continue; DBPRT(("%s: schd_handle_dedicated_time(%s)\n", id, qptr->queue->qname)); /* * Keep track of the next pointer, and zero the queue's next ptr so * it looks like a single queue. */ next = qptr->next; qptr->next = NULL; ran = schd_handle_dedicated_time(qptr->queue); if (ran < 0) { (void)sprintf(log_buffer, "schd_handle_dedicated_time(%s@%s) failed!", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } else { DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname, qptr->queue->exechost)); total_ran += ran; } /* Replace the zero'd next pointer to rechain the list. */ qptr->next = next; } /* * Attempt to revoke any unused HPM counters that are still in user * mode. Returns number of errors encountered. This should be zero * for a healthy system. */ if (schd_MANAGE_HPM) { if (schd_revoke_hpm()) { (void)sprintf(log_buffer, "Failed to revoke unused HPM counters!"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); } } if (total_ran > 0) { (void)sprintf(log_buffer, "System resources after scheduling:"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); schd_dump_rsrclist(); } (void)sprintf(log_buffer, ">>> End Scheduling Cycle (ran %d jobs) <<<", total_ran); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); return (1);}static intschedule_jobs(QueueList *queues, Job *jobs, char *reason){ char *id = "schedule_jobs"; int numran; Job *job; Queue *shortest; int priority_to_1st = 1; /* * Since the sorting code has provided an order in which the jobs should * be run, attempt to honor that order by treating the first job on the * list as our first priority. This amounts to draining the queue in * order to run that job, if necessary. * * If the job has been waiting too long, find the smallest queue in which * the job will fit, and consider its expected run time. If the waiting * job cannot run when the queue has emptied, then go on to the next. * However, if there are jobs running on the queue, it is possible that * this queue could support the waiting job if it were started draining * now. When enough jobs had exited, the waiting job would be runnable. * In order to determine if this is true, walk through the list of jobs, * which are sorted in order of completion (from soonest to last), and * find how many resources would be available after that job finished. * If there is space, calculate what time it will be when that many jobs * have completed, and see if the primetime limits apply at that time. * If the job fits in the primetime limits at that time, then start the * queue draining. If it will not fit after all jobs have been tested, * then give up on this queue and go on to the next. * * If a queue was found that requires draining, mark it for draining. * * After the waiting job handling has completed, collect a list of * all the available execution queues, and place it into the pointer * given to this function by the caller. */ for (job = jobs; job != NULL; job = job->next) { if (job->state != 'Q') continue; if (!priority_to_1st && !(job->flags & JFLAGS_WAITING)) continue; DBPRT(("%s: job %s is %s (eligible for %s, needs %d nodes)\n", id, job->jobid, priority_to_1st ? "FIRSTJOB" : (job->flags & JFLAGS_PRIORITY) ? "SPECIAL" : "WAITING", schd_sec2val(job->eligible), job->nodes)); /* * Find the smallest, shortest-wait queue in which this job will * fit. If it is empty, great. If not, mark it to be drained, * in anticipation of the job being run soon. Note that the queue * drain_by time should only be shortened - it doesn't make sense * to push it out. */ shortest = schd_find_drain(queues, job); if (shortest) { /* * If there are no jobs running in the queue, then unset the * draining flag (if present), so that the queue will be * available for this job. * * If there are running jobs, set the draining flag, and * adjust the empty_by value to be the expected time when * the job will first become runnable. */ if (shortest->running == 0) { shortest->flags &= ~QFLAGS_DRAINING; } else { /* If running jobs, empty_by should be non-zero. */ if (shortest->drain_by <= shortest->empty_by) { shortest->flags |= QFLAGS_DRAINING; DBPRT(("%s: shortest queue %s now draining, drain_by %s", id, shortest->qname, ctime(&shortest->drain_by))); } } } /* * We have looked at (and possibly arranged for special treatment * of) the first job on the list. Now only look for special or * waiting jobs. */ priority_to_1st = 0; } numran = schd_pack_queues(jobs, queues, reason); if (numran < 0) { (void)sprintf(log_buffer, "sched_pack_queues() failed!"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } return (numran);}/* * Get information about each of the queues in the list of lists. If * schd_get_queue_limits() fails, return the error condition. It may * be a transient or a hard failure, which the caller may want to deal * with. If all queues are successful, return '0'. */static intget_all_queue_info(int numqlists, ...){ va_list ap; int count = 0, ret; QueueList *list; QueueList *qptr; va_start(ap, numqlists); while (count < numqlists) { list = va_arg(ap, QueueList *); for (qptr = list; qptr != NULL; qptr = qptr->next) { /* * Get the limits, current resources, and any jobs for this * queue. */ if ((ret = schd_get_queue_limits(qptr->queue)) != 0) { DBPRT(("get_all_queue_info: get_queue_limits for %s failed.\n", qptr->queue->qname)); va_end(ap); return (ret); } /* * Set the queue flags if limits are exceeded. Don't bother * getting a reason string. */ schd_check_queue_limits(qptr->queue, NULL); } count ++; } va_end(ap); return (0);}/* * Jobs queued on the special queue should be treated as highest priority. * They are sorted onto the top of the list of jobs that is created in the * usersort.c code. That sorted list is then split out onto each of the * queues, so that each queue has a list of the jobs it "owns". The jobs * then carry a backpointer to their owner queue. * * This works really nicely, since all the information about each queue * (including the list of jobs queued/running/etc on it) lives right on * the Queue structure. It is a clean, elegant and fully general solution. * * An unfortunate side effect of this "demultiplexing" is that jobs that * were marked "special" end up claimed by the SpecialQueue. The scheduler * looks for jobs only on the SubmitQueue queue, so it never notices that * there are special jobs enqueued. In order to address this, any jobs on * the SpecialQueue are marked "waiting/high priority", and placed at the * head of the list of jobs in the SubmitQueue. * * This seems like an evil hack at first, and it arguably is. However, if * there are multiple submission queues, it is relatively simple to support * them by simply causing them to be inserted in the submit queue's list. */static intfixup_special(void){ char *id = "fixup_special"; Job *job, *submitjobs, *nextjob, *specialtail; Queue *queue; char comment[MAX_TXT + 1]; int fixedup = 0, justcomment = 0; /* * Sanity check -- this function shouldn't be called if there is no * valid SpecialQueue. */ if (schd_SpecialQueue == NULL || schd_SpecialQueue->queue == NULL) { DBPRT(("%s: special code called but no special queue defined!\n", id)); return (-1); } queue = schd_SpecialQueue->queue; if (queue->jobs == NULL) { DBPRT(("%s: no jobs on special queue '%s'. Ignoring.\n", id, queue->qname)); return (0); } /* * See if the special queue has anything to do, and if it will allow * anything to be done to it. */ if (queue->flags & QFLAGS_DISABLED) { (void)sprintf(comment, "Queue %s not enabled", queue->qname); justcomment ++; } if (queue->flags & QFLAGS_STOPPED) { (void)sprintf(comment, "Queue %s not started", queue->qname); justcomment ++; } /* * If the jobs on the speical queue should just be commented, do so and * return 0 -- no jobs were fixed up. */ if (justcomment) { for (job = queue->jobs; job != NULL; job = job->next) schd_comment_job(job, comment, JOB_COMMENT_REQUIRED); return 0; } /* * Detach the list of jobs from the SubmitQueue. They will be tacked * back onto the end of the list once the special jobs have been moved * to the head. */ submitjobs = schd_SubmitQueue->queue->jobs;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -