⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 errmgr_hnp.c

📁 MPI stands for the Message Passing Interface. Written by the MPI Forum (a large committee comprising
💻 C
字号:
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/orte_constants.h"

#include <stdlib.h>
#include <stdarg.h>

#include "opal/class/opal_list.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"

#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/schema/schema.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"

#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/hnp/errmgr_hnp.h"

/*
 * This function gets called when someone updates a process
 * state to indicate it has aborted. That action results in
 * the firing of a registry trigger that passes a minimal
 * data message here. The only part of that message we need
 * is the trigger name (msg->target) so we can extract the jobid from it
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 *
 * Returns ORTE_SUCCESS, or the error code of the first step that
 * forced an early return / of the final registry put.
 */
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t job;
    orte_vpid_t start, range;
    orte_std_cntr_t num;
    char *segment = NULL;
    char *tokens[] = {
        ORTE_JOB_GLOBALS,
        NULL
    };
    orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
    opal_list_t attrs;
    opal_list_item_t *item;
    int rc;

    OPAL_TRACE(1);

    opal_output(orte_errmgr_base_output, "errmgr:hnp: proc abort has been detected");

    /* This trigger is named, so we can extract the jobid
     * directly from the trigger name
     */
    if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* set the job state */
    if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_ABORTED))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* tell the pls to terminate the job AND ALL ITS DESCENDANTS */
    OBJ_CONSTRUCT(&attrs, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS,
                                                      ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) {
        /* report the problem, but still try to terminate whatever we can */
        ORTE_ERROR_LOG(rc);
    }
    if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, &attrs))) {
        /* deliberately not fatal: we still force the TERMINATE trigger below */
        ORTE_ERROR_LOG(rc);
    }
    while (NULL != (item = opal_list_remove_first(&attrs))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&attrs);

    /* orterun will only wakeup when all procs report terminated. The terminate_job
     * function *should* have done that - however, it is possible during abnormal
     * startup that it will fail to happen. If we get here, we force the issue by
     * deliberately causing the TERMINATE trigger to fire
     */
    if (ORTE_SUCCESS != (rc = orte_rmgr.get_vpid_range(job, &start, &range))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* store the size of the vpid range as the number of terminated procs */
    num = range;
    if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&num, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        /* the segment name string is owned by us - do not leak it */
        free(segment);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr.put_1(ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
                                             segment, tokens, ORTE_PROC_NUM_TERMINATED, &dval))) {
        ORTE_ERROR_LOG(rc);
    }

    /* the segment name string is owned by us - release it on the normal path too */
    free(segment);

    return rc;
}

/*
 * This function gets called when someone updates a process
 * state to indicate it failed to start. That action results in
 * the firing of a registry trigger that passes a minimal
 * data message here. The only part of that message we need
 * is the trigger name (msg->target) so we can extract the jobid from it
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 *
 * Returns ORTE_SUCCESS, or the error code of the step that failed.
 */
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t job;
    int rc;

    OPAL_TRACE(1);

    /* This trigger is named, so we can extract the jobid
     * directly from the trigger name
     */
    if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* set the job state */
    if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FAILED_TO_START))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* tell the pls to terminate the job - just kill this job, not any descendants since
     * the job is just trying to start
     */
    if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &orte_abort_timeout, NULL))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

/*
 * This function gets called when the HNP itself detects an internal error!
 * Ideally, we would find some way to tell all the active jobs to die before
 * we depart ourselves. Unfortunately, at this time, we aren't sure we can do
 * this - later, we'll add some more intelligence by, for example, checking
 * the error code to see if it's something that would allow us to alert
 * the remote orteds.
 *
 * For now, we'll just depart!
 *
 * error_code is handed to orte_abort(); fmt (may be NULL) is a printf-style
 * format for an optional message that is emitted before aborting.
 */
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* the expanded message is data, not a format string: any '%'
             * it contains must not be interpreted a second time
             */
            opal_output(0, "%s", buffer);
            free(buffer);
        } else {
            /* formatting failed (e.g., out of memory) - the buffer is not
             * usable, so at least report the raw format text
             */
            opal_output(0, "%s", fmt);
        }
        va_end(arglist);
    }

    /* abnormal exit */
    orte_abort(error_code, false);
}

/*
 * This function gets called when the HNP desperately needs to just die.
 * Nothing can be done by definition here - this function ONLY gets
 * called as an absolute last resort
 */
void orte_errmgr_hnp_abort(void)
{
    OPAL_TRACE(1);

    /* abnormal exit */
    orte_abort(-1, false);
}

/*
 * This function gets called when a process wants to request that the HNP
 * abort some set of processes for it. Since this component IS for the HNP,
 * that means we need to actually execute this request! Call upon the PLS
 * as needed to execute the abort requests
 *
 * NOTE: at present this is only a placeholder - neither procs nor nprocs
 * is examined, no process is aborted, and ORTE_SUCCESS is always returned.
 */
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    OPAL_TRACE(1);

    return ORTE_SUCCESS;
}

/*
 * Register the HNP's errmgr functions to be called when the job encounters
 * certain pre-identified problem states.
 *
 * NOTE: It is imperative that ONLY the HNP perform this registration!
 *
 * Returns ORTE_SUCCESS when both alert monitors were defined, otherwise the
 * error code of the definition that failed.
 */
int orte_errmgr_hnp_register_job(orte_jobid_t job)
{
    /* we need to setup two counters and their corresponding triggers - one
     * to alert us when something fails to launch, and another for when
     * someone aborts
     */
    int rc;

    OPAL_TRACE(1);

    /* define the ABORT trigger to fire when any process aborts */
    if (ORTE_SUCCESS != (rc = orte_smr.define_alert_monitor(job, ORTE_NUM_ABORTED_TRIGGER,
                                                            ORTE_PROC_NUM_ABORTED, 0, 1, true,
                                                            orte_errmgr_hnp_proc_aborted, NULL))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* define the FAILED_LAUNCH trigger to fire when the launch fails */
    if (ORTE_SUCCESS != (rc = orte_smr.define_alert_monitor(job, ORTE_FAILED_TO_START_TRIGGER,
                                                            ORTE_PROC_NUM_FAILED_START, 0, 1, true,
                                                            orte_errmgr_hnp_incomplete_start, NULL))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -