📄 pls_tm_module.c
字号:
mca_pls_tm_component.verbose) { opal_output(0, "pls:tm: launching on node %s", node->nodename); } /* setup process name */ rc = orte_ns.get_proc_name_string(&name_string, name); if (ORTE_SUCCESS != rc) { opal_output(0, "pls:tm: unable to create process name"); return rc; } free(argv[proc_name_index]); argv[proc_name_index] = strdup(name_string); /* exec the daemon */ if (mca_pls_tm_component.debug) { param = opal_argv_join(argv, ' '); if (NULL != param) { opal_output(0, "pls:tm: executing: %s", param); free(param); } } /* check for timing request - get start time if so */ if (mca_pls_tm_component.timing) { if (0 != gettimeofday(&launchstart, NULL)) { opal_output(0, "pls_tm: could not obtain start time"); launchstart.tv_sec = 0; launchstart.tv_usec = 0; } } rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched); if (TM_SUCCESS != rc) { return ORTE_ERROR; } if (ORTE_SUCCESS != rc) { opal_output(0, "pls:tm: start_procs returned error %d", rc); goto cleanup; } /* check for timing request - get stop time and process if so */ if (mca_pls_tm_component.timing) { if (0 != gettimeofday(&launchstop, NULL)) { opal_output(0, "pls_tm: could not obtain stop time"); } else { deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 + (launchstop.tv_usec - launchstart.tv_usec); avgtime = avgtime + deltat / num_nodes; if (deltat < mintime) { mintime = deltat; miniter = launched; } if (deltat > maxtime) { maxtime = deltat; maxiter = launched; } } } launched++; ++vpid; free(name); /* Allow some progress to occur */ opal_event_loop(OPAL_EVLOOP_NONBLOCK); } if (mca_pls_tm_component.debug) { opal_output(0, "pls:tm:launch: finished spawning orteds\n"); } /* check for timing request - get start time for launch completion */ if (mca_pls_tm_component.timing) { if (0 != gettimeofday(&completionstart, NULL)) { opal_output(0, "pls_tm: could not obtain completion start time"); completionstart.tv_sec = 0; completionstart.tv_usec = 0; } } /* all done, so store the daemon info on the registry */ if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) { ORTE_ERROR_LOG(rc); } /* TM poll for all the spawns */ for (i = 0; i < launched; ++i) { rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); if (TM_SUCCESS != rc) { errno = local_err; opal_output(0, "pls:tm: failed to poll for a spawned proc, return status = %d", rc); return ORTE_ERR_IN_ERRNO; } } /* check for timing request - get stop time for launch completion and report */ if (mca_pls_tm_component.timing) { if (0 != gettimeofday(&completionstop, NULL)) { opal_output(0, "pls_tm: could not obtain completion stop time"); } else { deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 + (launchstop.tv_usec - launchstart.tv_usec); opal_output(0, "pls_tm: launch completion required %d usec", deltat); } opal_output(0, "pls_tm: Launch statistics:"); opal_output(0, "pls_tm: Average time to launch an orted: %f usec", avgtime); opal_output(0, "pls_tm: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter); opal_output(0, "pls_tm: Min time to launch an orted: %d usec at iter %d", mintime, miniter); } cleanup: OBJ_RELEASE(map); if (connected) { pls_tm_disconnect(); } if (NULL != tm_events) { free(tm_events); } if (NULL != tm_task_ids) { free(tm_task_ids); } if (NULL != lib_base) { free(lib_base); } if (NULL != bin_base) { free(bin_base); } /* deconstruct the daemon list */ while (NULL != (item = opal_list_remove_first(&daemons))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&daemons); /* check for timing request - get stop time and process if so */ if (mca_pls_tm_component.timing) { if (0 != gettimeofday(&jobstop, NULL)) { opal_output(0, "pls_tm: could not obtain stop time"); } else { deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 + (jobstop.tv_usec - jobstart.tv_usec); opal_output(0, "pls_tm: launch of entire job required %d usec", deltat); } } if (mca_pls_tm_component.debug) { opal_output(0, "pls:tm:launch: finished\n"); } return rc;}static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs){ int rc; opal_list_t daemons; opal_list_item_t *item; /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* order them to kill their local procs for this job */ if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(&daemons, jobid, timeout))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } CLEANUP: while (NULL != (item = opal_list_remove_first(&daemons))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&daemons); return rc;}/** * Terminate the orteds for a given job */int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs){ int rc; opal_list_t daemons; opal_list_item_t *item; /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* now tell them to die! */ if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) { ORTE_ERROR_LOG(rc); } CLEANUP: while (NULL != (item = opal_list_remove_first(&daemons))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&daemons); return rc;}/* * TM can't kill individual processes -- PBS will kill the entire job */static int pls_tm_terminate_proc(const orte_process_name_t *name){ if (mca_pls_tm_component.debug) { opal_output(0, "pls:tm:terminate_proc: not supported"); } return ORTE_ERR_NOT_SUPPORTED;}static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs){ int rc; opal_list_t daemons; opal_list_item_t *item; /* construct the list of active daemons on this job */ OBJ_CONSTRUCT(&daemons, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&daemons); return rc; } /* order them to pass this signal to their local procs */ if (ORTE_SUCCESS != (rc = orte_pls_base_orted_signal_local_procs(&daemons, signal))) { ORTE_ERROR_LOG(rc); } while (NULL != (item = opal_list_remove_first(&daemons))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&daemons); return rc;}static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal){ return ORTE_ERR_NOT_IMPLEMENTED;}/** * Cancel an operation involving comm to an orted */static int pls_tm_cancel_operation(void){ int rc; if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) { ORTE_ERROR_LOG(rc); } return rc;}/* * Free stuff */static int pls_tm_finalize(void){ int rc; /* cleanup any pending recvs */ if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) { ORTE_ERROR_LOG(rc); } return ORTE_SUCCESS;}static int pls_tm_connect(void){ int ret; struct tm_roots tm_root; int count, progress; /* try a couple times to connect - might get busy signals every now and then */ for (count = 0 ; count < 10; ++count) { ret = tm_init(NULL, &tm_root); if (TM_SUCCESS == ret) { return ORTE_SUCCESS; } for (progress = 0 ; progress < 10 ; ++progress) { opal_progress();#if HAVE_SCHED_YIELD sched_yield();#endif } } return ORTE_ERR_RESOURCE_BUSY;}static int pls_tm_disconnect(void){ tm_finalize(); return ORTE_SUCCESS;}static int pls_tm_check_path(char *exe, char **env){ static int size = 256; int i; char *file; char *cwd; char *path = NULL; /* Do we want this check at all? */ if (!mca_pls_tm_component.want_path_check) { return ORTE_SUCCESS; } /* Find the path in the supplied environment */ for (i = 0; NULL != env[i]; ++i) { if (0 == strncmp("PATH=", env[i], 5)) { path = strdup(env[i]); break; } } if (NULL == env[i]) { path = strdup("NULL"); } /* Check the already-successful paths (i.e., be a little friendlier to the filesystem -- if we find the executable successfully, save it) */ for (i = 0; NULL != mca_pls_tm_component.checked_paths && NULL != mca_pls_tm_component.checked_paths[i]; ++i) { if (0 == strcmp(path, mca_pls_tm_component.checked_paths[i])) { return ORTE_SUCCESS; } } /* We didn't already find it, so check now. First, get the cwd. */ do { cwd = malloc(size); if (NULL == cwd) { return ORTE_ERR_OUT_OF_RESOURCE; } if (NULL == getcwd(cwd, size)) { free(cwd); if (ERANGE == errno) { size *= 2; } else { return ORTE_ERR_IN_ERRNO; } } else { break; } } while (1); /* Now do the search */ file = opal_path_findv(exe, X_OK, env, cwd); free(cwd); if (NULL == file) { free(path); return ORTE_ERR_NOT_FOUND; } if (mca_pls_tm_component.debug) { opal_output(0, "pls:tm: found %s", file); } free(file); /* Success -- so cache it */ opal_argv_append_nosize(&mca_pls_tm_component.checked_paths, path); /* All done */ free(path); return ORTE_SUCCESS;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -