C 将此算法并行化以使其更快_C_Algorithm_Parallel Processing_Grand Central Dispatch_Matrix Inverse

C 将此算法并行化以使其更快

c algorithm parallel-processing

C 将此算法并行化以使其更快,c,algorithm,parallel-processing,grand-central-dispatch,matrix-inverse,C,Algorithm,Parallel Processing,Grand Central Dispatch,Matrix Inverse,作为一个挑战，我被要求做一个矩阵求逆的并行算法。我主要是在为它做研究的时候看和在我试图编写自己的代码之前，我无意中发现了我来自objective-c背景，因此我立即想到使用GCD完成这项任务。我还遇到了一个叫做的东西，它看起来更低级，如果GCD不起作用，它可能适合这个任务——我不知道我对并行化的天真尝试只是将每个for循环替换为一个dispatch\u apply，它起作用了（原始循环和逆循环的乘积生成了单位矩阵）。然而，这只是大大减慢了速度（大约是乍一看速度的20倍）。我知道有，但我主要

作为一个挑战，我被要求做一个矩阵求逆的并行算法。我主要是在为它做研究的时候看和

在我试图编写自己的代码之前，我无意中发现了

我来自objective-c背景，因此我立即想到使用GCD完成这项任务。我还遇到了一个叫做的东西，它看起来更低级，如果GCD不起作用，它可能适合这个任务——我不知道

我对并行化的天真尝试只是将每个for循环替换为一个

dispatch\u apply

，它起作用了（原始循环和逆循环的乘积生成了单位矩阵）。然而，这只是大大减慢了速度（大约是乍一看速度的20倍）。我知道有，但我主要感兴趣的是如何更好地解决这个问题，而不是我已经读过的那些答案的链接。问题是否可能是我创建调度队列的方式，或者可能是我只使用了一个调度队列

#include <stdio.h>
#include <dispatch/dispatch.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define PARALLEL true

void invertMatrixNonParallel(double **matrix, long n);
void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q);

void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q)
{
    __block double r;
    __block long temp;

    dispatch_apply(n, q, ^(size_t i) {
        dispatch_apply(n, q, ^(size_t j) {
            matrix[i][j + n] = (j == i) ? 1 : 0;
        });
    });
    /* using gauss-jordan elimination */

    dispatch_apply(n, q, ^(size_t j) {
        temp=j;

        /* finding maximum jth column element in last (n-j) rows */

        dispatch_apply(n - j - 1, q, ^(size_t i) {
            if (matrix[i + j + 1][j] > matrix[temp][j])
            {
                temp = i + j + 1;
            }
        });

        /* swapping row which has maximum jth column element */

        if(temp!=j)
        {
            double *row = matrix[j];
            matrix[j] = matrix[temp];
            matrix[temp] = row;
        }

        /* performing row operations to form required identity matrix out of the input matrix */
        dispatch_apply(n, q, ^(size_t i) {
            r = matrix[i][j];

            if (i == j)
            {
                dispatch_apply(2 * n, q, ^(size_t k) {
                    matrix[i][k]/=r ;
                });
            }
            else
            {
                dispatch_apply(2 * n, q, ^(size_t k) {
                    matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
                });
            }
        });
    });
}

void invertMatrixNonParallel(double **matrix, long n)
{
    double temporary, r;
    long i, j, k, temp;

    for (i = 0; i < n; ++i)
    {
        for (j = n; j < n * 2; ++j)
        {
            matrix[i][j] = (j == i + n) ? 1 : 0;
        }
    }
    /* using gauss-jordan elimination */

    for(j=0; j<n; j++)
    {
        temp=j;

        /* finding maximum jth column element in last (n-j) rows */

        for(i=j+1; i<n; i++)
            if(matrix[i][j]>matrix[temp][j])
                temp=i;

        /* swapping row which has maximum jth column element */

        if(temp!=j)
        {
            for(k=0; k<2*n; k++)
            {
                temporary=matrix[j][k] ;
                matrix[j][k]=matrix[temp][k] ;
                matrix[temp][k]=temporary ;
            }
        }

        /* performing row operations to form required identity matrix out of the input matrix */

        for(i=0; i<n; i++)
        {
            if(i!=j)
            {
                r=matrix[i][j];
                for(k=0; k<2*n; k++)
                    matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
            }
            else
            {
                r=matrix[i][j];
                for(k=0; k<2*n; k++)
                    matrix[i][k]/=r ;
            }
        }
    }
}

#pragma mark - Main

int main(int argc, const char * argv[])
{
    long i, j, k;
    const long n = 5;
    const double range = 10.0;
    __block double **matrix;
    __block double **invertedMatrix = malloc(sizeof(double *) * n);

    matrix = malloc(sizeof(double *) * n);
    invertedMatrix = malloc(sizeof(double *) * n);
    for (i = 0; i < n; ++i)
    {
        matrix[i] = malloc(sizeof(double) * n);
        invertedMatrix[i] = malloc(sizeof(double) * n * 2);
        for (j = 0; j < n; ++j)
        {
            matrix[i][j] = drand48() * range;
            invertedMatrix[i][j] = matrix[i][j];
        }
    }

    clock_t t;

#if PARALLEL
    dispatch_queue_t q1 = dispatch_queue_create("com.example.queue1", DISPATCH_QUEUE_CONCURRENT);
    t = clock();
    invertMatrixParallel(invertedMatrix, n, q1);
#else
    t = clock();
    invertMatrixNonParallel(invertedMatrix, n);
#endif

    t = clock() - t;
    double time_taken = ((double)t * 1000)/CLOCKS_PER_SEC; // in seconds

    printf("\n%s took %f milliseconds to execute \n\n", (PARALLEL == true) ? "Parallel" : "Non-Parallel", time_taken);

    printf("Here's the product of the inverted matrix and the original matrix\n");
    double product[n][n];
    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            double sum = 0;
            for (k = 0; k < n; ++k)
            {
                sum += matrix[i][k] * invertedMatrix[k][j + n];
            }
            product[i][j] = sum;
        }
    }

    // should print the identity matrix
    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            printf("%5.2f%s", product[i][j], (j < n - 1) ? ", " : "\n");
        }
    }

    return 0;
}

对于非平行：

Parallel took 0.098000 milliseconds to execute

Non-Parallel took 0.004000 milliseconds to execute

对于这两种情况：

Here's the product of the inverted matrix and the original matrix
 1.00, -0.00, -0.00,  0.00, -0.00
 0.00,  1.00,  0.00,  0.00,  0.00
 0.00, -0.00,  1.00, -0.00,  0.00
-0.00, -0.00, -0.00,  1.00,  0.00
 0.00,  0.00,  0.00,  0.00,  1.00

请不要回答那些只是链接的问题，我只是在万不得已的情况下才这么做。

0）正如评论中已经提到的，你需要更大的矩阵。创建并行线程需要一些开销时间，所以如果它花费的时间太少，您就无法生成更快的并行版本。即使您能够为小型矩阵实现更好的性能，也很难精确测量

(一)

每个嵌套循环的并行化并没有多大意义。在调度队列中逐个添加每个操作是没有意义的，因为它仍然需要一些开销，所以最好添加一些非平凡的块

dispatch_apply(n, q, ^(size_t i) {
    for (j = n; j < n * 2; ++j) {
        matrix[i][j + n] = (j == i) ? 1 : 0;
    }
});

dispatch\u apply（n，q，^（大小）{
对于（j=n；j


够了
2） 您需要了解线程安全并充分理解算法，否则您可能会遇到应用程序不可预测且不可再现的错误行为。我不确定是否有许多循环可以有效且真正安全地并行，除了上面提到的初始化，还有一个标记为/*的循环执行行操作以从输入矩阵中形成所需的标识矩阵*/
因此，您可能需要找到一些特定的并行矩阵求逆算法。
对于更高秩的矩阵，请尝试。“对于5x5矩阵，我怀疑并行解决方案会更快。”Bathsheba提出了一个很好的建议。目前，并行版本的级别较高存在问题。我相信这是因为嵌套的dispatch\u apply
s使用相同的队列，所以我将尝试解决这个问题并回到这个问题。您想要多大的矩阵（适合内存两次？还是从光盘加载？）另外，请不要忘记，主板缓存大小是有限的，如果您跨越了这一障碍，并且无法对子数据进行计算，那么并行版本将变得慢得多。请注意：您使用GEM，它大量使用if，并且还需要一些行的预排序（用于哪个对角线1）为了避免无效的结果（在错误的顺序上，存在逆运算，但特别是对于包含多个零的矩阵，却找不到逆运算），这会稍微减慢速度。您是否尝试过通过更适合并行计算的行列式计算逆运算？（但是你必须小心递归堆栈/堆垃圾）或者你必须被GEM卡住？@Spektre，我不必使用GEM。请，如果你知道一个更好的并行矩阵求逆算法。我很想听。
dispatch_apply(n, q, ^(size_t i) {
    for (j = n; j < n * 2; ++j) {
        matrix[i][j + n] = (j == i) ? 1 : 0;
    }
});