MPI Gatherv not working correctly (message truncated)

Date: 2018-11-22 16:14:57

Tags: c mpi

Hello, I'm having a problem with MPI_Gatherv: it fails to "gather" the values and returns:

Fatal error in MPI_Gatherv: Message truncated, error stack:
MPI_Gatherv failed(sbuf=0x000001E0AAE36920, scount=16, MPI_INT, rbuf=0x000001E0AAE367E0, rcnts=0x000001E0AAE18500, displs=0x0000005A09F6F9D8, MPI_INT, root=0, MPI_COMM_WORLD) failed
Message truncated; 16 bytes received but buffer size is 16

The code is in C. My code:

#include "stdio.h"
#include "mpi.h"
#include <stdlib.h>

int* multiply(int* x, int xLength, int* y, int yLength) {

    int* resultMatrix = (int *) malloc(xLength*yLength * sizeof(int));
    int r = 0;

    for (int i = 0; i < xLength; i++) {
        for (int j = 0; j < yLength; j++) {
            resultMatrix[r] = x[i] * y[j];
            printf("\nresult[%d]: %d", r, resultMatrix[r]);
            r++;
        }
    }
    return resultMatrix;
}

int* countOfValuesOfProcess(int matrixLength, int numOfProcesses) {
    int* countOfValuesOfProcess = (int*) malloc (numOfProcesses);
    for (int i = 0; i < numOfProcesses; i++) {
        if (i == numOfProcesses - 1) {
            countOfValuesOfProcess[i] = (matrixLength / numOfProcesses) + (matrixLength % numOfProcesses);
        } else {
            countOfValuesOfProcess[i] = matrixLength / numOfProcesses;
        }
    }
    return countOfValuesOfProcess;
}

int main(argc, argv)
int argc; char *argv[];
{
    int x[] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int y[] = { 2,3, -1, 4 };
    int* result;
    int size, rank;
    int* recieveInt;
    MPI_Status status;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int xSize = sizeof(x) / sizeof(x[0]);
    int ySize = sizeof(y) / sizeof(y[0]);


    result = (int *) malloc((xSize * ySize) * sizeof(int));

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int* numOfValuesPerProcess = countOfValuesOfProcess(xSize, size);
    int displs[4];

    recieveInt = (int *) malloc(numOfValuesPerProcess[rank] * sizeof(int));

    int* resultPart = (int *) malloc((numOfValuesPerProcess[rank] * ySize) * sizeof(int));

    //displs count
    if (rank == 0) {
        displs[0] = 0;
        for (int i = 1; i < size; i++) {
            displs[i] = (displs[i - 1] + numOfValuesPerProcess[i - 1]);
        }
    }

    MPI_Scatterv(x, numOfValuesPerProcess, displs, MPI_INT, recieveInt, numOfValuesPerProcess[rank], MPI_INT, 0, MPI_COMM_WORLD);

    resultPart = multiply(recieveInt, numOfValuesPerProcess[rank], y, ySize);

    MPI_Gatherv(resultPart, numOfValuesPerProcess[rank]*ySize, MPI_INT, result, numOfValuesPerProcess, displs, MPI_INT, 0, MPI_COMM_WORLD);

    free(resultPart);
    free(recieveInt);
    MPI_Finalize();
    return(0);
}

When I replace numOfValuesPerProcess[rank]*ySize with numOfValuesPerProcess[rank] in the gatherv part, it runs, but the result is:

gathered matrix[0]: 2
gathered matrix[1]: 3
gathered matrix[2]: -1
gathered matrix[3]: 4
gathered matrix[4]: 10
gathered matrix[5]: 15
gathered matrix[6]: -5
gathered matrix[7]: 20
gathered matrix[8]: 18
gathered matrix[9]: 27
gathered matrix[10]: -9
gathered matrix[11]: 36
gathered matrix[12]: 26
gathered matrix[13]: 39
gathered matrix[14]: -13
gathered matrix[15]: 52
gathered matrix[16]: -842150451
gathered matrix[17]: -842150451
gathered matrix[18]: -842150451
gathered matrix[19]: -842150451
gathered matrix[20]: -842150451
gathered matrix[21]: -842150451
gathered matrix[22]: -842150451
gathered matrix[23]: -842150451
gathered matrix[24]: -842150451
gathered matrix[25]: -842150451
gathered matrix[26]: -842150451
gathered matrix[27]: -842150451
gathered matrix[28]: -842150451
gathered matrix[29]: -842150451
gathered matrix[30]: -842150451
gathered matrix[31]: -842150451
gathered matrix[32]: -842150451
gathered matrix[33]: -842150451
gathered matrix[34]: -842150451
gathered matrix[35]: -842150451
gathered matrix[36]: -842150451
gathered matrix[37]: -842150451
gathered matrix[38]: -842150451
gathered matrix[39]: -842150451
gathered matrix[40]: -842150451
gathered matrix[41]: -842150451
gathered matrix[42]: -842150451
gathered matrix[43]: -842150451
gathered matrix[44]: -842150451
gathered matrix[45]: -842150451
gathered matrix[46]: -842150451
gathered matrix[47]: -842150451
gathered matrix[48]: -842150451
gathered matrix[49]: -842150451
gathered matrix[50]: -842150451
gathered matrix[51]: -842150451
gathered matrix[52]: -842150451
gathered matrix[53]: -842150451
gathered matrix[54]: -842150451
gathered matrix[55]: -842150451
gathered matrix[56]: -842150451
gathered matrix[57]: -842150451
gathered matrix[58]: -842150451
gathered matrix[59]: -842150451
gathered matrix[60]: -842150451
gathered matrix[61]: -842150451
gathered matrix[62]: -842150451
gathered matrix[63]: -842150451

We can see that the first 16 numbers were gathered, but the rest are missing (because we only requested part of the full result). I don't know where the problem is; I tried allocating more memory for int* result, but that didn't help.

Where could the problem be? Thanks for any suggestions.

1 Answer:

Answer 0 (score: 1)

In the MPI_Gatherv call you send numOfValuesPerProcess[rank]*ySize elements from each rank, but you only reserve room for numOfValuesPerProcess[rank] elements on the receiving side. After the multiplication you are sending/receiving ySize times as much data, so the recvcounts and displs arguments of the MPI_Gatherv call need to account for that factor of ySize.
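In other words, the key change is to scale the per-rank counts and the displacements by ySize before gathering; this is the same loop that appears in the full updated code below:

    // Each scattered input element produces ySize result elements after
    // multiply(), so the recvcounts and displacements must grow by ySize.
    for (int i = 0; i < size; i++) {
        numOfValuesPerProcess[i] *= ySize;
        displs[i] *= ySize;
    }

    MPI_Gatherv(resultPart, numOfValuesPerProcess[rank], MPI_INT,
                result, numOfValuesPerProcess, displs, MPI_INT,
                0, MPI_COMM_WORLD);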

As an aside, you also appear to have a number of memory leaks: there aren't enough frees to match the mallocs. Learn to use a tool such as valgrind to help find and fix these.
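For example, a typical invocation under an MPI launcher might look like this (the process count and program name are only placeholders for your own setup):

mpiexec -n 4 valgrind --leak-check=full ./gatherv_example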

Updated code:

#include "stdio.h"
#include "mpi.h"
#include <stdlib.h>

int* multiply(int* x, int xLength, int* y, int yLength) {

    int* resultMatrix = malloc(xLength*yLength * sizeof(int));
    int r = 0;

    for (int i = 0; i < xLength; i++) {
        for (int j = 0; j < yLength; j++) {
            resultMatrix[r] = x[i] * y[j];
            //printf("\nresult[%d]: %d", r, resultMatrix[r]);
            r++;
        }
    }
    return resultMatrix;
}

int* countOfValuesOfProcess(int matrixLength, int numOfProcesses) {
    int* countOfValuesOfProcess = malloc(numOfProcesses * sizeof(int));
    for (int i = 0; i < numOfProcesses; i++)
    {
        if (i == numOfProcesses - 1) {
            countOfValuesOfProcess[i] = (matrixLength / numOfProcesses) + (matrixLength % numOfProcesses);
        }
        else
        {
            countOfValuesOfProcess[i] = matrixLength / numOfProcesses;
        }
    }
    return countOfValuesOfProcess;
}

int main(int argc, char *argv[])
{
    int x[] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int y[] = { 2, 3, -1, 4 };
    int* result;
    int size, rank;
    int* recieveInt;
    MPI_Status status;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int xSize = sizeof(x) / sizeof(x[0]);
    int ySize = sizeof(y) / sizeof(y[0]);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int* numOfValuesPerProcess = countOfValuesOfProcess(xSize, size);

    int displs[size];
    //displs count
    if (rank == 0) {
      displs[0] = 0;
      for (int i = 1; i < size; i++) {
        displs[i] = (displs[i - 1] + numOfValuesPerProcess[i - 1]);
      } 
    }

    recieveInt = malloc(numOfValuesPerProcess[rank] * sizeof(int));

    MPI_Scatterv(x, numOfValuesPerProcess, displs, MPI_INT, recieveInt, numOfValuesPerProcess[rank], MPI_INT, 0, MPI_COMM_WORLD);

    int* resultPart = multiply(recieveInt, numOfValuesPerProcess[rank], y, ySize);

    for (int i = 0; i < size; i++)
    {
      numOfValuesPerProcess[i] *= ySize;
      displs[i] *= ySize;
    }

    result = (int *) malloc((xSize * ySize) * sizeof(int));
    MPI_Gatherv(resultPart, numOfValuesPerProcess[rank], MPI_INT, result, numOfValuesPerProcess, displs, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0)
    {
      for (int i = 0; i < xSize*ySize; i++)
        printf("result[%d]: %d\n", i, result[i]);
    }

    free(resultPart);
    free(recieveInt);
    free(numOfValuesPerProcess);
    free(result);

    MPI_Finalize();
    return(0);
}
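To build and run the updated program with a standard MPI toolchain (the file and binary names here are only placeholders):

mpicc gatherv_example.c -o gatherv_example
mpiexec -n 4 ./gatherv_example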