📄 launchprocess.cpp
字号:
delete arg; return; } if (strcmp(pszStr, "ERROR_SUCCESS")) { if (arg->i == 0 && !g_bNoMPI) { printf("Failed to launch the root process:\n%s\n%s\n", arg->pszCmdLine, pszStr);fflush(stdout); } else { printf("Failed to launch process %d:\n'%s'\n%s\n", arg->i, arg->pszCmdLine, pszStr);fflush(stdout); } sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } } } // Get the port number and redirect input to the first process if (arg->i == 0 && !g_bNoMPI) { /* // Check if the root process is alive sprintf(pszStr, "getexitcode %d", launchid); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("Error: Unable to send a getexitcode command to '%s'\r\nError %d", arg->pszHost, WSAGetLastError());fflush(stdout); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (!ReadStringTimeout(sock, pszStr, g_nLaunchTimeout)) { printf("ERROR: Unable to read the result of the root getexitcode command on '%s': error %d", arg->pszHost, WSAGetLastError()); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (stricmp(pszStr, "ACTIVE") != 0) { printf("ERROR: Root process has unexpectedly exited.\n"); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } */ // barrier to let the root process do the put sprintf(pszStr, "barrier name=%s count=2", arg->pszJobID); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("ERROR: Unable to write the barrier command: error %d", WSAGetLastError()); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } bool bBarrierLoopContinue = true; while (bBarrierLoopContinue) { if (!ReadStringTimeout(sock, pszStr, g_nLaunchTimeout)) { error = WSAGetLastError(); if (error != 0) { printf("ERROR: Unable to read the result of the barrier command on '%s': error %d", arg->pszHost, error); } else { if (bLocalStartup) { // check to see if the process is still running } else { sprintf(pszStr, "getexitcode %d", launchid); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("Error: Unable to send a getexitcode command to '%s'\r\nError %d", arg->pszHost, WSAGetLastError());fflush(stdout); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (!ReadStringTimeout(sock, pszStr, g_nLaunchTimeout)) { printf("ERROR: Unable to read the result of the root getexitcode command on '%s': error %d", arg->pszHost, WSAGetLastError()); sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (stricmp(pszStr, "ACTIVE") == 0) { printf("ERROR: timed-out waiting for the root process to call MPI_Init\n"); if (g_bUseJobHost) { // Save this process's information to the job database PutJobProcessInDatabase(arg, nPid); } } else { printf("ERROR: The root process has unexpectedly exited.\n"); if (g_bUseJobHost) { sprintf(pszStr, "geterror %d", launchid); WriteString(sock, pszStr); pszStr[0] = '\0'; ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); // Save this process's information to the job database PutJobProcessInDatabase(arg, nPid); UpdateJobKeyValue(0, "error", pszStr); } sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); } } WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (strncmp(pszStr, "SUCCESS", 8)) { if (strncmp(pszStr, "INFO", 4) == 0) { char *s; int id, x=0; s = strstr(pszStr, "id="); if (s) { s++;s++;s++; // move over 'id=' id = atoi(s); s = strstr(pszStr, "exitcode="); if (s) { s += strlen("exitcode="); // move over 'exitcode=' x = atoi(s); } if (id == launchid) { printf("ERROR: root process has unexpectedly exited. Exit code = %d\n", x); if (bLocalStartup) { sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); } WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } } } else { printf("ERROR: barrier failed on '%s':\n%s", arg->pszHost, pszStr); if (bLocalStartup) { sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); } WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } } else { bBarrierLoopContinue = false; } } // after the barrier, the data is available so do the get sprintf(pszStr, "dbget name=%s key=port", pszStartupDB); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("ERROR: Unable to write '%s': error %d", pszStr, WSAGetLastError()); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (!ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT)) { printf("ERROR: Unable to get the root port: error %d", WSAGetLastError()); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (strncmp(pszStr, DBS_FAIL_STR, strlen(DBS_FAIL_STR)+1) == 0) { printf("ERROR: Unable to get the root port:\n%s", pszStr); sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); WriteString(sock, "done"); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } // save the retrieved data g_nRootPort = atoi(pszStr); // destroy the database since it is no longer necessary sprintf(pszStr, "dbdestroy name=%s", pszStartupDB); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("ERROR: Unable to write '%s' to socket[%d]\n", pszStr, sock); //ExitProcess(0); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } // read result if (!ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT)) { printf("ERROR: ReadString failed to read the result of dbdestroy: error %d\n", WSAGetLastError()); //ExitProcess(0); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } if (strnicmp(pszStr, DBS_FAIL_STR, strlen(DBS_FAIL_STR)+1) == 0) { printf("Unable to destroy the database '%s' on '%s'\n%s", pszStartupDB, arg->pszHost, pszStr);fflush(stdout); sprintf(pszStr, "freeprocess %d", launchid); WriteString(sock, pszStr); ReadStringTimeout(sock, pszStr, g_nMPIRUN_SHORT_TIMEOUT); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } } if (g_bUseJobHost) { //printf("MPIRunLaunchProcess:putting job in database\n");fflush(stdout); // Save this process's information to the job database PutJobProcessInDatabase(arg, nPid); } // Wait for the process to exit if (bLocalStartup) { // send a simulated getexitcodewait command to the local process } else { sprintf(pszStr, "getexitcodewait %d", launchid); if (WriteString(sock, pszStr) == SOCKET_ERROR) { printf("Error: Unable to send a getexitcodewait command to '%s'\r\nError %d", arg->pszHost, WSAGetLastError());fflush(stdout); easy_closesocket(sock); SetEvent(g_hAbortEvent); delete arg; return; } //printf("getexitcodewait %d socket: 0x%p:%d\n", arg->i, sock, sock);fflush(stdout); } int i = InterlockedIncrement(&g_nNumProcessSockets) - 1; g_pProcessSocket[i] = sock; g_pProcessLaunchId[i] = launchid; g_pLaunchIdToRank[i] = arg->i; //printf("[[[[P:%d]]]]\n", launchid);fflush(stdout); } else { printf("MPIRunLaunchProcess: Connect to %s failed, error %d\n", arg->pszHost, error);fflush(stdout); //ExitProcess(0); SetEvent(g_hAbortEvent); delete arg; return; } memset(arg->pszPassword, 0, 100); delete arg;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -