⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 orterun.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
📖 第 1 页 / 共 4 页
字号:
        }                /* Wait for the app to complete */        if (wait_for_job_completion) {            OPAL_THREAD_LOCK(&orterun_globals.lock);            while (!orterun_globals.exit) {                opal_condition_wait(&orterun_globals.cond,                                    &orterun_globals.lock);            }            /* check to see if the job was aborted */            if (ORTE_JOBID_INVALID != jobid &&                ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {                if (ORTE_SUCCESS != rc) {                    ORTE_ERROR_LOG(rc);                }                /* define the exit state as abnormal by default */                exit_state = ORTE_JOB_STATE_ABORTED;            }            if (ORTE_JOB_STATE_TERMINATED != exit_state) {                /* abnormal termination of some kind */                dump_aborted_procs(jobid);                /* If we showed more abort messages than were allowed,                show a followup message here */                if (num_aborted > max_display_aborted) {                    i = num_aborted - max_display_aborted;                    printf("%d additional process%s aborted (not shown)\n",                           i, ((i > 1) ? "es" : ""));                }                if (num_killed > 0) {                    printf("%d process%s killed (possibly by Open MPI)\n",                           num_killed, ((num_killed > 1) ? "es" : ""));                }            }            /* Make sure we propagate the exit code */            if (WIFEXITED(orterun_globals.exit_status)) {                rc = WEXITSTATUS(orterun_globals.exit_status);            } else {                /* If a process was killed by a signal, then make the                 * exit code of orterun be "signo + 128" so that "prog"                 * and "orterun prog" will both set the same status                 * value for the shell */                rc = WTERMSIG(orterun_globals.exit_status) + 128;            }                        /* the job is complete - now tell the orteds that it is             * okay to finalize and exit, we are done with them             * be sure to include any descendants so nothing is             * left hanging             */            if (ORTE_JOBID_INVALID != jobid) {                OBJ_CONSTRUCT(&attributes, opal_list_t);                orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);                if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {                    opal_show_help("help-orterun.txt", "orterun:daemon-die", true,                                   orterun_basename, ORTE_ERROR_NAME(ret));                }                while (NULL != (item = opal_list_remove_first(&attributes))) {                    OBJ_RELEASE(item);                }                OBJ_DESTRUCT(&attributes);            }            OPAL_THREAD_UNLOCK(&orterun_globals.lock);            /* If we were forcibly killed, print a warning that the               user may still have some manual cleanup to do. */            if (ORTE_JOBID_INVALID == jobid) {                opal_show_help("help-orterun.txt", "orterun:abnormal-exit",                               true, orterun_basename, orterun_basename);            }        }    }DONE:    for (i = 0; i < num_apps; ++i) {        OBJ_RELEASE(apps[i]);    }    free(apps);    OBJ_RELEASE(apps_pa);        orte_finalize();    free(orterun_basename);    return rc;}/* * On abnormal termination - dump the * exit status of the aborted procs. */static void dump_aborted_procs(orte_jobid_t jobid){    char *segment;    orte_gpr_value_t** values = NULL;    orte_std_cntr_t i, k, num_values = 0;    int rc;    int32_t exit_status = 0;    bool exit_status_set;    char *keys[] = {        ORTE_PROC_NAME_KEY,        ORTE_PROC_LOCAL_PID_KEY,        ORTE_PROC_RANK_KEY,        ORTE_PROC_EXIT_CODE_KEY,        ORTE_NODE_NAME_KEY,        NULL    };    OPAL_TRACE_ARG1(1, jobid);    /* query the job segment on the registry */    if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {        ORTE_ERROR_LOG(rc);        return;    }    rc = orte_gpr.get(        ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,        segment,        NULL,        keys,        &num_values,        &values        );    if(rc != ORTE_SUCCESS) {        ORTE_ERROR_LOG(rc);        free(segment);        return;    }    for (i = 0; i < num_values; i++) {        orte_gpr_value_t* value = values[i];        orte_process_name_t name, *nptr;        pid_t pid = 0, *pidptr;        orte_std_cntr_t rank = 0, *sptr;        bool rank_found=false;        char* node_name = NULL;        orte_exit_code_t *ecptr;        exit_status = 0;        exit_status_set = false;        for(k=0; k < value->cnt; k++) {            orte_gpr_keyval_t* keyval = value->keyvals[k];            if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) {                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nptr, keyval->value, ORTE_NAME))) {                    ORTE_ERROR_LOG(rc);                    continue;                }                name = *nptr;                continue;            }            if(strcmp(keyval->key, ORTE_PROC_LOCAL_PID_KEY) == 0) {                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pidptr, keyval->value, ORTE_PID))) {                    ORTE_ERROR_LOG(rc);                    continue;                }                pid = *pidptr;                continue;            }            if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) {                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {                    ORTE_ERROR_LOG(rc);                    continue;                }                rank_found = true;                rank = *sptr;                continue;            }            if(strcmp(keyval->key, ORTE_PROC_EXIT_CODE_KEY) == 0) {                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&ecptr, keyval->value, ORTE_EXIT_CODE))) {                    ORTE_ERROR_LOG(rc);                    continue;                }                exit_status = *ecptr;                exit_status_set = true;                continue;            }            if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {                node_name = (char*)(keyval->value->data);                continue;            }        }        if (rank_found) {            if (WIFSIGNALED(exit_status)) {                if (9 == WTERMSIG(exit_status)) {                    ++num_killed;                } else {                    if (num_aborted < max_display_aborted) {#ifdef HAVE_STRSIGNAL                        if (NULL != strsignal(WTERMSIG(exit_status))) {                            opal_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", false,                                       orterun_basename, (unsigned long)rank, (unsigned long)pid,                                       node_name, WTERMSIG(exit_status),                                        strsignal(WTERMSIG(exit_status)));                        } else {#endif                            opal_show_help("help-orterun.txt", "orterun:proc-aborted", false,                                       orterun_basename, (unsigned long)rank, (unsigned long)pid,                                       node_name, WTERMSIG(exit_status));#ifdef HAVE_STRSIGNAL                        }#endif                    }                    ++num_aborted;                }            }        }        /* If we haven't done so already, hold the exit_status so we           can return it when exiting.  Specifically, keep the first           non-zero entry.  If they all return zero, we'll return           zero.  We already have the globals.lock (from           job_state_callback), so don't try to get it again. */        if (0 == orterun_globals.exit_status && exit_status_set) {            orterun_globals.exit_status = exit_status;        }        OBJ_RELEASE(value);    }    if (NULL != values) {        free(values);    }    free(segment);}/* * signal main thread when application completes */static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state){    OPAL_TRACE_ARG2(1, jobid, state);    OPAL_THREAD_LOCK(&orterun_globals.lock);    /* Note that there's only three states that we're interested in       here:       TERMINATED: which means that all the processes in the job have                completed (normally and/or abnormally).       AT_STG1: which means that everyone has hit stage gate 1, so we                can do the parallel debugger startup stuff.       Remember that the rmgr itself will also be called for the       ABORTED state and call the pls.terminate_job, which will result       in killing all the other processes. */    if (orte_debug_flag) {        opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n",                    jobid, state);    }    switch(state) {        case ORTE_PROC_STATE_TERMINATED:            orterun_globals.exit_status = 0;  /* set the exit status to indicate normal termination */            orterun_globals.exit = true;            opal_condition_signal(&orterun_globals.cond);            break;        case ORTE_PROC_STATE_AT_STG1:            orte_totalview_init_after_spawn(jobid);            break;        default:            opal_output(0, "orterun: job state callback in unexpected state - jobid %lu, state 0x%04x\n", jobid, state);            break;    }    OPAL_THREAD_UNLOCK(&orterun_globals.lock);}/* * Fail-safe in the event the job hangs and doesn't * cleanup correctly. */static void exit_callback(int fd, short event, void *arg){    OPAL_TRACE(1);    /* Remove the TERM and INT signal handlers */    opal_signal_del(&term_handler);    opal_signal_del(&int_handler);#ifndef __WINDOWS__    /** Remove the USR signal handlers */    opal_signal_del(&sigusr1_handler);    opal_signal_del(&sigusr2_handler);#endif  /* __WINDOWS__ */    /* Trigger the normal exit conditions */    orterun_globals.exit = true;    orterun_globals.exit_status = 1;    opal_condition_signal(&orterun_globals.cond);}/* * Attempt to terminate the job and wait for callback indicating * the job has been aborted. */typedef enum {    ABORT_SIGNAL_FIRST,    ABORT_SIGNAL_PROCESSING,    ABORT_SIGNAL_WARNED,    ABORT_SIGNAL_DONE} abort_signal_state_t;static void abort_signal_callback(int fd, short flags, void *arg){    int ret;    opal_event_t* event;    opal_list_t attrs;    opal_list_item_t *item;    static abort_signal_state_t state=ABORT_SIGNAL_FIRST;    static struct timeval invoked, now;    double a, b;        OPAL_TRACE(1);        /* If this whole process has already completed, then bail */    switch (state) {    case ABORT_SIGNAL_FIRST:        /* This is the first time through */        state = ABORT_SIGNAL_PROCESSING;        break;                case ABORT_SIGNAL_WARNED:        gettimeofday(&now, NULL);        a = invoked.tv_sec * 1000000 + invoked.tv_usec;        b = now.tv_sec * 1000000 + invoked.tv_usec;        if (b - a <= 1000000) {            if (!orterun_globals.quiet){                fprintf(stderr, "%s: forcibly killing job...\n",                         orterun_basename);            }            /* tell the pls to cancel the terminate request -             * obviously, something is wrong at this point             */            if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {                ORTE_ERROR_LOG(ret);            }                        /* We are in an event handler; exit_callback() will delete               the handler that is currently running (which is a Bad               Thing), so we can't call it directly.  Instead, we have               to exit this handler and setup to call exit_handler()               after this. */            if (NULL != (event = (opal_event_t*)                         malloc(sizeof(opal_event_t)))) {                opal_evtimer_set(event, exit_callback, NULL);                now.tv_sec = 0;                now.tv_usec = 0;                opal_evtimer_add(event, &now);                state = ABORT_SIGNAL_DONE;            }            return;        }         /* Otherwise fall through to PROCESSING and warn again */                    case ABORT_SIGNAL_PROCESSING:        opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",                       true, orterun_basename, orterun_basename,                        orterun_basename);        gettimeofday(&invoked, NULL);        state = ABORT_SIGNAL_WARNED;        return;            case ABORT_SIGNAL_DONE:        /* Nothing to do -- return */        return;    }    if (!orterun_globals.quiet){        fprintf(stderr, "%s: killing job...\n\n", orterun_basename);    }        /* terminate the job - this will also wakeup orterun so     * it can kill all the orteds. Be sure to kill all the job's     * descendants, if any, so nothing is left hanging     */    if (jobid != ORTE_JOBID_INVALID) {        OBJ_CONSTRUCT(&attrs, opal_list_t);        orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);        ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);        while (NULL != (item = opal_list_remove_first(&attrs))) {            OBJ_RELEASE(item);        }        OBJ_DESTRUCT(&attrs);        if (ORTE_SUCCESS != ret) {            /* If we failed the terminate_job() above, then the               condition variable in the main loop in orterun won't               wake up.  So signal it. */            if (NULL != (event = (opal_event_t*)                         malloc(sizeof(opal_event_t)))) {                opal_evtimer_set(event, exit_callback, NULL);                now.tv_sec = 0;                now.tv_usec = 0;                opal_evtimer_add(event, &now);            } else {                /* We really don't want to do this, but everything                   else has failed... */                orterun_globals.exit = true;                orterun_globals.exit_status = 1;                opal_condition_signal(&orterun_globals.cond);            }            jobid = ORTE_JOBID_INVALID;        }    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -