C 使用填充防止错误共享_C_Caching_Concurrency_Pthreads_False Sharing

C 使用填充防止错误共享

c caching concurrency

C 使用填充防止错误共享,c,caching,concurrency,pthreads,false-sharing,C,Caching,Concurrency,Pthreads,False Sharing,我想计算一个大矩阵的和，但当我使用多个线程或仅使用一个线程时，我目前没有看到任何性能改进。我认为问题与错误共享有关，但我也在结构中添加了填充。请看一看 #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <time.h> #include <pthread.h> #define WIDTH 20000 pthread_mutex_t mylock =

我想计算一个大矩阵所有元素的和，但无论使用多个线程还是只使用一个线程，我都看不到任何性能提升。我认为问题与伪共享（false sharing）有关，但我已经在结构体中添加了填充。请看一看下面的代码：

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

/* Side length of the square matrix; createBigMatrix() allocates
 * WIDTH * WIDTH ints. */
#define WIDTH 20000 
/* Protects finalSum: partialSum() holds this lock while it adds its
 * partial result to the shared total. */
pthread_mutex_t mylock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Work description handed to one summing thread.
 *
 * start and end are the first and last matrix indices the thread adds up
 * (the worker loop treats both bounds as inclusive); matrix points at the
 * shared flat array of cells. The member i is not read by any code visible
 * in this file.
 *
 * padding widens the struct under the author's stated assumptions that the
 * other members occupy 24 bytes and that a cache line is 64 bytes, with the
 * intent that two instances never share a cache line.
 */
struct split {
    int start;              /* first index to sum */
    int end;                /* last index to sum (inclusive) */
    int *matrix;            /* flat array shared by all workers */
    int i;                  /* unused in the visible code */
    char padding[64 - 24];  /* filler up to the assumed cache-line size */
};

/* Return the next rand() value reduced modulo 21, i.e. a number in 0..20. */
int ran(){
    const int modulus = 21;
    int raw = rand();

    return raw % modulus;
}
/*
 * Allocate a WIDTH x WIDTH matrix as one flat array and fill every cell with
 * a value from ran().
 *
 * Returns a pointer the caller must release with free(). The allocation is
 * very large, so failure is a realistic outcome; in that case a message is
 * written to stderr and the process exits with EXIT_FAILURE rather than
 * writing through a null pointer.
 */
int* createBigMatrix(){
    /* Do the size arithmetic in size_t so it cannot overflow int. */
    const size_t cells = (size_t)WIDTH * (size_t)WIDTH;
    int* a = malloc(cells * sizeof *a);

    if (a == NULL){
        fprintf(stderr, "createBigMatrix: cannot allocate %zu ints\n", cells);
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < cells; i++){
        a[i] = ran(); // fill up the matrix with random numbers
    }
    return a;
}
/*
 * Grand total of all matrix cells. Worker threads add to it only while
 * holding mylock. It is a long long because 400,000,000 cells holding values
 * up to 20 can add up to roughly 8e9, which does not fit in a 32-bit int.
 */
static long long finalSum;

/*
 * Thread entry point: add up matrix[start..end] (both bounds inclusive) as
 * described by the struct split passed in arg, then fold the result into
 * finalSum under mylock.
 *
 * Takes ownership of arg and frees it before returning. Always returns NULL.
 */
void* partialSum(void* arg){
    struct split* a = arg;
    /* Copy the bounds and the base pointer into locals so the hot loop does
     * not have to go back through *a on every iteration. */
    const int* cells = a->matrix;
    const int first = a->start;
    const int last = a->end;
    long long totalSum = 0; // thread-private accumulator

    for (int i = first; i <= last; i++){
        totalSum += cells[i];
    }

    pthread_mutex_lock(&mylock);
    finalSum += totalSum; // critical section
    pthread_mutex_unlock(&mylock);

    free(a);
    return NULL;
}

/*
 * Build the matrix, sum every cell (with four threads when useMultiThreads
 * is non-zero, otherwise in a single loop) and print the total.
 *
 * NOTE(review): the original carried the comment "-294925289" here,
 * presumably the total it printed when the sum was accumulated in an int.
 */
int main(){
    /* CELL_COUNT must equal WIDTH * WIDTH, the number of ints that
     * createBigMatrix() allocates: 20,000 x 20,000 = 400,000,000. */
    enum { THREAD_COUNT = 4, CELL_COUNT = 400000000 };
    int useMultiThreads = 1; // set to 0 to sum on the main thread only
    pthread_t thread_ids[THREAD_COUNT];

    finalSum = 0;
    int* c = createBigMatrix();
    if (c == NULL){
        fprintf(stderr, "could not allocate the matrix\n");
        return EXIT_FAILURE;
    }

    printf("%zu\n", sizeof(struct split));
    if (useMultiThreads){
        // split the cells evenly among the threads
        const int chunk = CELL_COUNT / THREAD_COUNT;

        for (int i = 0; i < THREAD_COUNT; i++){
            struct split* a = malloc(sizeof *a);
            if (a == NULL){
                fprintf(stderr, "could not allocate thread arguments\n");
                exit(EXIT_FAILURE);
            }
            a->start = i * chunk;
            /* The last thread also takes any remainder of the division. */
            a->end = (i == THREAD_COUNT - 1) ? CELL_COUNT - 1
                                             : a->start + chunk - 1;
            a->matrix = c;

            /* pthread_create returns an error number, not -1/errno. */
            int rc = pthread_create(&thread_ids[i], NULL, partialSum, a);
            if (rc != 0){
                fprintf(stderr, "pthread_create failed (error %d)\n", rc);
                free(a); /* the thread never started, so it cannot free it */
                exit(EXIT_FAILURE);
            }
        }

        for (int i = 0; i < THREAD_COUNT; i++){ // wait for every worker
            pthread_join(thread_ids[i], NULL);
        }
    }
    else { // use single thread
        for (int i = 0; i < CELL_COUNT; i++){
            finalSum += c[i];
        }
    }

    printf("total sum is %lld\n", finalSum);
/*
    Timing figures left by the original author:
    real    0m4.871s
    user    0m4.844s
    sys     0m0.392s
*/
    free(c);
    return 0;
}

#包括
#包括
#包括
#包括
#包括
#定义宽度20000
pthread\u mutex\u t mylock=pthread\u mutex\u初始值设定项；
结构拆分{//sizeof（拆分）=24
int启动；
内端；
int*矩阵；
int i；
char padding[64-24]；//填充私有和变量将强制它们进入单独的缓存线，并删除错误共享。假设缓存线为64字节
};
int-ran（）{
返回rand（）%21；
}
int*createBigMatrix（）{
int*a=malloc（sizeof（int）*宽度*宽度）；
对于（inti=0；istart；i end；i++）{
总和+=a->矩阵[i]；
}
pthread_mutex_lock（&mylock）；
finalSum+=totalSum；//临界截面
pthread_mutex_unlock（&mylock）；
免费（a）；
返回0；
} 
int main（）{/-294925289
int useMultiphreads=1；//使用一个线程或四个线程之间没有区别
finalSum=0；
pthread_t thread_id[4]；
//我想要一个npages宽度的正方形矩阵
int*c=createBigMatrix（）；
printf（“%lu\n”，sizeof（struct split））；
如果（使用多线程）{
//将任务平均分为4个线程
//因为有20000x2000个单元，所以必须有400000000个单元
int start[]={0,100000000,200000000,300000000}；
int end[]={9999999999992999999939999999}；
//求和
对于（int i=0；i<4；i++）{
结构拆分*a=malloc（sizeof（结构拆分））；
a->start=start[i]；
a->end=end[i]；
a->matrix=c；
pthread_create（thread_id+i，NULL，partialSum，a）；
}
对于（inti=0；i<4；i++）{//将它们连接起来
pthread_join（thread_id[i]，NULL）；
}
}
否则{//使用单线程
对于（int i=0；i
我看不出结构体的填充与这段代码的性能有什么关系：实际被访问的数据位于指针所指向的矩阵中，而不在结构体里。
至于您所关心的没有加速的问题，这很可能是因为您的代码完全是内存受限（memory-bound）的。也就是说，要执行求和，必须通过内存总线从内存中获取数据（您的矩阵太大，无法放入缓存）。因此，您的计算受到内存总线带宽的限制，而该带宽由所有内核共享。
还要注意的是，程序的运行时间主要并不是花在求和上，而是花在程序串行部分对 ran() 的调用上。
由于各线程使用的矩阵索引互不重叠，而且对参数结构体进行填充也没有帮助，因此似乎不太可能存在伪共享（false sharing）。
您是如何测量求和所花费的时间的？在我看来，在求和开始之前创建并填充这个庞大的数组，很可能才是整个程序运行时间的主要部分。
请小心索引：对于这么大的矩阵，int 显然不是合适的类型。还要考虑 for 循环中对 a-> 的使用：编译器无法知道 *a 是否会在背后被修改。您可以将 a 声明为 restrict 限定的指针，但更简单的方法是将这些值（边界和矩阵指针）加载到局部变量中，并在循环中使用它们。