📄 ad_write_coll.c
字号:
recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int)); /* total size of data to be recd. from each proc. in an iteration.*/ sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int)); /* amount of data sent to each proc so far. Used in ADIOI_Fill_send_buffer. initialized to 0 here. */ send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int)); curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int)); done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int)); /* Above three are used in ADIOI_Fill_send_buffer*/ start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int)); /* used to store the starting value of curr_offlen_ptr[i] in this iteration */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); if (!buftype_is_contig) { ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; } MPI_Type_extent(datatype, &buftype_extent);/* I need to check if there are any outstanding nonblocking writes to the file, which could potentially interfere with the writes taking place in this collective write call. Since this is not likely to be common, let me do the simplest thing possible here: Each process completes all pending nonblocking operations before completing. */ ADIOI_Complete_async(error_code); if (*error_code != MPI_SUCCESS) return; MPI_Barrier(fd->comm); done = 0; off = st_loc;#ifdef PROFILE MPE_Log_event(14, 0, "end computation");#endif for (m=0; m < ntimes; m++) { /* go through all others_req and check which will be satisfied by the current write */ /* Note that MPI guarantees that displacements in filetypes are in monotonically nondecreasing order and that, for writes, the filetypes cannot specify overlapping regions in the file. This simplifies implementation a bit compared to reads. */ /* off = start offset in the file for the data to be written in this iteration size = size of data written (bytes) corresponding to off req_off = off in file for a particular contiguous request minus what was satisfied in previous iteration req_size = size corresponding to req_off */ /* first calculate what should be communicated */#ifdef PROFILE MPE_Log_event(13, 0, "start computation");#endif for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0; size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done)); for (i=0; i < nprocs; i++) { if (others_req[i].count) { start_pos[i] = curr_offlen_ptr[i]; for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) { if (partial_recv[i]) { /* this request may have been partially satisfied in the previous iteration. */ req_off = others_req[i].offsets[j] + partial_recv[i]; req_len = others_req[i].lens[j] - partial_recv[i]; partial_recv[i] = 0; /* modify the off-len pair to reflect this change */ others_req[i].offsets[j] = req_off; others_req[i].lens[j] = req_len; } else { req_off = others_req[i].offsets[j]; req_len = others_req[i].lens[j]; } if (req_off < off + size) { count[i]++; MPI_Address(write_buf+req_off-off, &(others_req[i].mem_ptrs[j])); recv_size[i] += (int)(ADIOI_MIN(off + (ADIO_Offset)size - req_off, req_len)); if (off+size-req_off < req_len) { partial_recv[i] = (int) (off + size - req_off); if ((j+1 < others_req[i].count) && (others_req[i].offsets[j+1] < off+size)) { FPRINTF(stderr, "Error: the filetype specifies an overlapping write\n"); MPI_Abort(MPI_COMM_WORLD, 1); } break; } } else break; } curr_offlen_ptr[i] = j; } } #ifdef PROFILE MPE_Log_event(14, 0, "end computation"); MPE_Log_event(7, 0, "start communication");#endif ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, off, size, count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, done_to_proc, &hole, m, buftype_extent, buf_idx, error_code); if (*error_code != MPI_SUCCESS) return;#ifdef PROFILE MPE_Log_event(8, 0, "end communication");#endif flag = 0; for (i=0; i<nprocs; i++) if (count[i]) flag = 1; if (flag) { ADIO_WriteContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, error_code); if (*error_code != MPI_SUCCESS) return; } off += size; done += size; } for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;#ifdef PROFILE MPE_Log_event(7, 0, "start communication");#endif for (m=ntimes; m<max_ntimes; m++) /* nothing to recv, but check for send. */ ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, off, size, count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, done_to_proc, &hole, m, buftype_extent, buf_idx, error_code); if (*error_code != MPI_SUCCESS) return;#ifdef PROFILE MPE_Log_event(8, 0, "end communication");#endif if (ntimes) ADIOI_Free(write_buf); ADIOI_Free(curr_offlen_ptr); ADIOI_Free(count); ADIOI_Free(partial_recv); ADIOI_Free(send_size); ADIOI_Free(recv_size); ADIOI_Free(sent_to_proc); ADIOI_Free(start_pos); ADIOI_Free(send_buf_idx); ADIOI_Free(curr_to_proc); ADIOI_Free(done_to_proc);}static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf, ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, int *len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *hole, int iter, MPI_Aint buftype_extent, int *buf_idx, int *error_code){ int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err; char **send_buf = NULL; MPI_Request *requests; MPI_Datatype *recv_types; MPI_Status *statuses, status; int *srt_len, sum; ADIO_Offset *srt_off;#ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_W_EXCHANGE_DATA";#endif/* exchange recv_size info so that each process knows how much to send to whom. */ MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm); /* create derived datatypes for recv */ nprocs_recv = 0; for (i=0; i<nprocs; i++) if (recv_size[i]) nprocs_recv++; recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv+1)*sizeof(MPI_Datatype)); /* +1 to avoid a 0-size malloc */ tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int)); j = 0; for (i=0; i<nprocs; i++) { if (recv_size[i]) {/* take care if the last off-len pair is a partial recv */ if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; tmp_len[i] = others_req[i].lens[k]; others_req[i].lens[k] = partial_recv[i]; } MPI_Type_hindexed(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types+j); /* absolute displacements; use MPI_BOTTOM in recv */ MPI_Type_commit(recv_types+j); j++; } } /* To avoid a read-modify-write, check if there are holes in the data to be written. For this, merge the (sorted) offset lists others_req using a heap-merge. */ sum = 0; for (i=0; i<nprocs; i++) sum += count[i]; srt_off = (ADIO_Offset *) ADIOI_Malloc((sum+1)*sizeof(ADIO_Offset)); srt_len = (int *) ADIOI_Malloc((sum+1)*sizeof(int)); /* +1 to avoid a 0-size malloc */ ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum);/* for partial recvs, restore original lengths */ for (i=0; i<nprocs; i++) if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; others_req[i].lens[k] = tmp_len[i]; } ADIOI_Free(tmp_len);/* check if there are any holes */ *hole = 0; for (i=0; i<sum-1; i++) if (srt_off[i]+srt_len[i] < srt_off[i+1]) { *hole = 1; break; } ADIOI_Free(srt_off); ADIOI_Free(srt_len); if (nprocs_recv) { if (*hole) { ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err); if (err != MPI_SUCCESS) {#ifdef PRINT_ERR_MSG FPRINTF(stderr, "ADIOI_GEN_WriteStridedColl: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.\n"); MPI_Abort(MPI_COMM_WORLD, 1);#else *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_READ_PERM, myname, (char *) 0, (char *) 0); ADIOI_Error(fd, *error_code, myname); return; #endif } } } nprocs_send = 0; for (i=0; i < nprocs; i++) if (send_size[i]) nprocs_send++; requests = (MPI_Request *) ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); /* +1 to avoid a 0-size malloc *//* post receives */ j = 0; for (i=0; i<nprocs; i++) { if (recv_size[i]) { MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter, fd->comm, requests+j); j++; } }/* post sends. if buftype_is_contig, data can be directly sent from user buf at location given by buf_idx. else use send_buf. */ if (buftype_is_contig) { j = 0; for (i=0; i < nprocs; i++) if (send_size[i]) { MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], MPI_BYTE, i, myrank+i+100*iter, fd->comm, requests+nprocs_recv+j);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -