Question

作为一项挑战，我被要求制作一个用于反转矩阵的并行算法。在研究它时，我主要查看了this paper和this SO question。

在我尝试编写自己的代码之前，我偶然发现了someone else's implementation。

我来自一个客观的背景，所以我立刻想到使用GCD来完成这项任务。我还遇到了一些名为POSIX的东西，它看起来更低级，如果GCD不起作用，可能适合这项任务 - 我不知道。

我对这种并行化的天真尝试只是用dispatch_apply替换每个for循环，它起作用（原始和逆的乘积产生单位矩阵）。然而，这只是显着减慢了事情（大约20倍，一目了然）。我看到有SO questions on GCD and for-loops，但我主要关注的是什么是更好的方法，而不是我已经读过的那些答案的链接。问题可能是我创建调度队列的方式，还是我只使用一个调度队列的事实？

#include <stdio.h>
#include <dispatch/dispatch.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define PARALLEL true

void invertMatrixNonParallel(double **matrix, long n);
void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q);

void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q)
{
    __block double r;
    __block long temp;

    dispatch_apply(n, q, ^(size_t i) {
        dispatch_apply(n, q, ^(size_t j) {
            matrix[i][j + n] = (j == i) ? 1 : 0;
        });
    });
    /* using gauss-jordan elimination */

    dispatch_apply(n, q, ^(size_t j) {
        temp=j;

        /* finding maximum jth column element in last (n-j) rows */

        dispatch_apply(n - j - 1, q, ^(size_t i) {
            if (matrix[i + j + 1][j] > matrix[temp][j])
            {
                temp = i + j + 1;
            }
        });

        /* swapping row which has maximum jth column element */

        if(temp!=j)
        {
            double *row = matrix[j];
            matrix[j] = matrix[temp];
            matrix[temp] = row;
        }

        /* performing row operations to form required identity matrix out of the input matrix */
        dispatch_apply(n, q, ^(size_t i) {
            r = matrix[i][j];

            if (i == j)
            {
                dispatch_apply(2 * n, q, ^(size_t k) {
                    matrix[i][k]/=r ;
                });
            }
            else
            {
                dispatch_apply(2 * n, q, ^(size_t k) {
                    matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
                });
            }
        });
    });
}

void invertMatrixNonParallel(double **matrix, long n)
{
    double temporary, r;
    long i, j, k, temp;

    for (i = 0; i < n; ++i)
    {
        for (j = n; j < n * 2; ++j)
        {
            matrix[i][j] = (j == i + n) ? 1 : 0;
        }
    }
    /* using gauss-jordan elimination */

    for(j=0; j<n; j++)
    {
        temp=j;

        /* finding maximum jth column element in last (n-j) rows */

        for(i=j+1; i<n; i++)
            if(matrix[i][j]>matrix[temp][j])
                temp=i;

        /* swapping row which has maximum jth column element */

        if(temp!=j)
        {
            for(k=0; k<2*n; k++)
            {
                temporary=matrix[j][k] ;
                matrix[j][k]=matrix[temp][k] ;
                matrix[temp][k]=temporary ;
            }
        }

        /* performing row operations to form required identity matrix out of the input matrix */

        for(i=0; i<n; i++)
        {
            if(i!=j)
            {
                r=matrix[i][j];
                for(k=0; k<2*n; k++)
                    matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
            }
            else
            {
                r=matrix[i][j];
                for(k=0; k<2*n; k++)
                    matrix[i][k]/=r ;
            }
        }
    }
}

#pragma mark - Main

int main(int argc, const char * argv[])
{
    long i, j, k;
    const long n = 5;
    const double range = 10.0;
    __block double **matrix;
    __block double **invertedMatrix = malloc(sizeof(double *) * n);

    matrix = malloc(sizeof(double *) * n);
    invertedMatrix = malloc(sizeof(double *) * n);
    for (i = 0; i < n; ++i)
    {
        matrix[i] = malloc(sizeof(double) * n);
        invertedMatrix[i] = malloc(sizeof(double) * n * 2);
        for (j = 0; j < n; ++j)
        {
            matrix[i][j] = drand48() * range;
            invertedMatrix[i][j] = matrix[i][j];
        }
    }

    clock_t t;

#if PARALLEL
    dispatch_queue_t q1 = dispatch_queue_create("com.example.queue1", DISPATCH_QUEUE_CONCURRENT);
    t = clock();
    invertMatrixParallel(invertedMatrix, n, q1);
#else
    t = clock();
    invertMatrixNonParallel(invertedMatrix, n);
#endif

    t = clock() - t;
    double time_taken = ((double)t * 1000)/CLOCKS_PER_SEC; // in seconds

    printf("\n%s took %f milliseconds to execute \n\n", (PARALLEL == true) ? "Parallel" : "Non-Parallel", time_taken);

    printf("Here's the product of the inverted matrix and the original matrix\n");
    double product[n][n];
    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            double sum = 0;
            for (k = 0; k < n; ++k)
            {
                sum += matrix[i][k] * invertedMatrix[k][j + n];
            }
            product[i][j] = sum;
        }
    }

    // should print the identity matrix
    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            printf("%5.2f%s", product[i][j], (j < n - 1) ? ", " : "\n");
        }
    }

    return 0;
}

并行输出：

Parallel took 0.098000 milliseconds to execute

对于非平行：

Non-Parallel took 0.004000 milliseconds to execute

对于两者：

Here's the product of the inverted matrix and the original matrix
 1.00, -0.00, -0.00,  0.00, -0.00
 0.00,  1.00,  0.00,  0.00,  0.00
 0.00, -0.00,  1.00, -0.00,  0.00
-0.00, -0.00, -0.00,  1.00,  0.00
 0.00,  0.00,  0.00,  0.00,  1.00

拜托，没有答案只是链接，我只是使用SO作为最后的手段。

Answer 1

0）正如评论中已经提到的，你需要更大的矩阵。创建并行线程需要一些开销时间，因此如果它花费的时间太少，则无法使并行版本更快。即使你能够为小矩阵实现更好的性能，也很难准确测量。

1）

dispatch_apply(n, q, ^(size_t i) {
    dispatch_apply(n, q, ^(size_t j) {
        matrix[i][j + n] = (j == i) ? 1 : 0;
    });
});

每个嵌套循环的并行化没有多大意义。没有意义在调度队列中逐个添加每个操作，因为它仍然需要一些开销，所以最好添加一些非常重要的块。

dispatch_apply(n, q, ^(size_t i) {
    for (j = n; j < n * 2; ++j) {
        matrix[i][j + n] = (j == i) ? 1 : 0;
    }
});

够了。

2）您需要了解线程安全性并很好地理解您的算法，否则您可能会遇到应用程序的不可预测且不可重现的错误行为。我不确定是否有很多循环可以高效并且非常安全地并行，除了上面提到的初始化和一个标记有/ *执行行操作以形成输入矩阵所需的单位矩阵的循环* /

所以你可能需要找到一些特定的并行矩阵求逆算法。

并行化此算法以使其更快

1 个答案: