Caching 错误共享和缓存对齐我在C++中有以下代码： #include <stdio.h> #include <string> #include <vector> using namespace std; struct th_private{ double mean_tau; th_private() { mean_tau = 0; } }; class resistor { public: string name; /***************************************************************************** Approach 0: Within each resistor strcuture, declare arrays of 'thread private' variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use mean_tau[1], offset[1]... and so on. As I understand, this is not a good approach, would lead to a lot of false sharing. /*****************************************************************************/ vector<double> mean_tau; /***************************************************************************** Approach 1: 1D array of struct th_private in each instance of the resistor, where state[0] is used ONLY by thread[0], state[0] is used ONLY by thread[1] and so on. Could potentially elimiate false sharing, but how to ensure it will align in the cache? /*****************************************************************************/ vector<th_private> state; resistor( ) { name = ""; } void prepare_for_threads( int num_threads ) { /* If Approach 0 */ mean_tau.resize(num_threads); /* Else If Approach 1 */ state.resize(num_threads); } ~resistor(){} }; class mesh { public: vector<resistor*> R; mesh( int num_resistors, int num_threads ) { for( int i = 0; i < num_resistors; i++ ) { resistor *r = new resistor(); r->prepare_for_threads( num_threads ); R.push_back(r); } } ~mesh(){} }; /***************************************************************************** Approach 2: Declare a global 2D matrix, where each row belongs to a thread and each column belongs to a resistor. Seems to be the best approach. R[0] R[1] R[2] R[3] R[4] R[9] thread0: [0][0] [0][1] [0][2] [0][3] [0][4] .. [0][9] ... thread3: [3][0] [3][1] [3][2] [3][3] [3][4] .. 
[3][9] /*****************************************************************************/ th_private __attribute__((aligned(0x1000))) global_state[4][10]; int main( int argc, char** argv ) { // Assume that there are 4 threads declared. mesh grid(10, 4); printf("sizeof(th_private): %d\n", sizeof(th_private)); printf("Approach 1: %p %p %p %p\n", &grid.R[0]->state[0], &grid.R[0]->state[1], &grid.R[0]->state[2], &grid.R[0]->state[3]); printf("Approach 2: %p %p %p %p\n", &global_state[0][0], &global_state[0][1], &global_state[0][2], &global_state[0][3]); }_Caching_Parallel Processing_False Sharing

caching/
Caching 错误共享和缓存对齐我在C++中有以下代码： #include <stdio.h> #include <string> #include <vector> using namespace std; struct th_private{ double mean_tau; th_private() { mean_tau = 0; } }; class resistor { public: string name; /***************************************************************************** Approach 0: Within each resistor strcuture, declare arrays of 'thread private' variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use mean_tau[1], offset[1]... and so on. As I understand, this is not a good approach, would lead to a lot of false sharing. /*****************************************************************************/ vector<double> mean_tau; /***************************************************************************** Approach 1: 1D array of struct th_private in each instance of the resistor, where state[0] is used ONLY by thread[0], state[0] is used ONLY by thread[1] and so on. Could potentially elimiate false sharing, but how to ensure it will align in the cache? /*****************************************************************************/ vector<th_private> state; resistor( ) { name = ""; } void prepare_for_threads( int num_threads ) { /* If Approach 0 */ mean_tau.resize(num_threads); /* Else If Approach 1 */ state.resize(num_threads); } ~resistor(){} }; class mesh { public: vector<resistor*> R; mesh( int num_resistors, int num_threads ) { for( int i = 0; i < num_resistors; i++ ) { resistor *r = new resistor(); r->prepare_for_threads( num_threads ); R.push_back(r); } } ~mesh(){} }; /***************************************************************************** Approach 2: Declare a global 2D matrix, where each row belongs to a thread and each column belongs to a resistor. Seems to be the best approach. R[0] R[1] R[2] R[3] R[4] R[9] thread0: [0][0] [0][1] [0][2] [0][3] [0][4] .. [0][9] ... thread3: [3][0] [3][1] [3][2] [3][3] [3][4] .. 
[3][9] /*****************************************************************************/ th_private __attribute__((aligned(0x1000))) global_state[4][10]; int main( int argc, char** argv ) { // Assume that there are 4 threads declared. mesh grid(10, 4); printf("sizeof(th_private): %d\n", sizeof(th_private)); printf("Approach 1: %p %p %p %p\n", &grid.R[0]->state[0], &grid.R[0]->state[1], &grid.R[0]->state[2], &grid.R[0]->state[3]); printf("Approach 2: %p %p %p %p\n", &global_state[0][0], &global_state[0][1], &global_state[0][2], &global_state[0][3]); }

Caching 错误共享和缓存对齐我在C++中有以下代码： #include <stdio.h> #include <string> #include <vector> using namespace std; struct th_private{ double mean_tau; th_private() { mean_tau = 0; } }; class resistor { public: string name; /***************************************************************************** Approach 0: Within each resistor strcuture, declare arrays of 'thread private' variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use mean_tau[1], offset[1]... and so on. As I understand, this is not a good approach, would lead to a lot of false sharing. /*****************************************************************************/ vector<double> mean_tau; /***************************************************************************** Approach 1: 1D array of struct th_private in each instance of the resistor, where state[0] is used ONLY by thread[0], state[0] is used ONLY by thread[1] and so on. Could potentially elimiate false sharing, but how to ensure it will align in the cache? /*****************************************************************************/ vector<th_private> state; resistor( ) { name = ""; } void prepare_for_threads( int num_threads ) { /* If Approach 0 */ mean_tau.resize(num_threads); /* Else If Approach 1 */ state.resize(num_threads); } ~resistor(){} }; class mesh { public: vector<resistor*> R; mesh( int num_resistors, int num_threads ) { for( int i = 0; i < num_resistors; i++ ) { resistor *r = new resistor(); r->prepare_for_threads( num_threads ); R.push_back(r); } } ~mesh(){} }; /***************************************************************************** Approach 2: Declare a global 2D matrix, where each row belongs to a thread and each column belongs to a resistor. Seems to be the best approach. R[0] R[1] R[2] R[3] R[4] R[9] thread0: [0][0] [0][1] [0][2] [0][3] [0][4] .. [0][9] ... thread3: [3][0] [3][1] [3][2] [3][3] [3][4] .. 
[3][9] /*****************************************************************************/ th_private __attribute__((aligned(0x1000))) global_state[4][10]; int main( int argc, char** argv ) { // Assume that there are 4 threads declared. mesh grid(10, 4); printf("sizeof(th_private): %d\n", sizeof(th_private)); printf("Approach 1: %p %p %p %p\n", &grid.R[0]->state[0], &grid.R[0]->state[1], &grid.R[0]->state[2], &grid.R[0]->state[3]); printf("Approach 2: %p %p %p %p\n", &global_state[0][0], &global_state[0][1], &global_state[0][2], &global_state[0][3]); }

caching parallel-processing

Caching 错误共享和缓存对齐我在C++中有以下代码： #include <stdio.h> #include <string> #include <vector> using namespace std; struct th_private{ double mean_tau; th_private() { mean_tau = 0; } }; class resistor { public: string name; /***************************************************************************** Approach 0: Within each resistor strcuture, declare arrays of 'thread private' variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use mean_tau[1], offset[1]... and so on. As I understand, this is not a good approach, would lead to a lot of false sharing. /*****************************************************************************/ vector<double> mean_tau; /***************************************************************************** Approach 1: 1D array of struct th_private in each instance of the resistor, where state[0] is used ONLY by thread[0], state[0] is used ONLY by thread[1] and so on. Could potentially elimiate false sharing, but how to ensure it will align in the cache? /*****************************************************************************/ vector<th_private> state; resistor( ) { name = ""; } void prepare_for_threads( int num_threads ) { /* If Approach 0 */ mean_tau.resize(num_threads); /* Else If Approach 1 */ state.resize(num_threads); } ~resistor(){} }; class mesh { public: vector<resistor*> R; mesh( int num_resistors, int num_threads ) { for( int i = 0; i < num_resistors; i++ ) { resistor *r = new resistor(); r->prepare_for_threads( num_threads ); R.push_back(r); } } ~mesh(){} }; /***************************************************************************** Approach 2: Declare a global 2D matrix, where each row belongs to a thread and each column belongs to a resistor. Seems to be the best approach. R[0] R[1] R[2] R[3] R[4] R[9] thread0: [0][0] [0][1] [0][2] [0][3] [0][4] .. [0][9] ... thread3: [3][0] [3][1] [3][2] [3][3] [3][4] .. 
[3][9] /*****************************************************************************/ th_private __attribute__((aligned(0x1000))) global_state[4][10]; int main( int argc, char** argv ) { // Assume that there are 4 threads declared. mesh grid(10, 4); printf("sizeof(th_private): %d\n", sizeof(th_private)); printf("Approach 1: %p %p %p %p\n", &grid.R[0]->state[0], &grid.R[0]->state[1], &grid.R[0]->state[2], &grid.R[0]->state[3]); printf("Approach 2: %p %p %p %p\n", &global_state[0][0], &global_state[0][1], &global_state[0][2], &global_state[0][3]); },caching,parallel-processing,false-sharing,Caching,Parallel Processing,False Sharing,每个电阻器都有一组属性，可通过线程进行读写修改。理想情况下，它们可以被视为线程专用变量。但是，由于旧代码库施加的一些限制，我只能选择以下三种方法之一：方法0：在每个电阻器结构中，声明“线程专用”数组变量。线程0将使用均值τ[0]，偏移量[0]…，线程1将使用平均_头[1]，偏移量[1]。。。等等据我所知，这不是一个好主意这种做法，会导致很多虚假的分享。方法1：电阻器每个实例中的1D结构THU私有阵列，其中状态[0]仅由线程[0]使用，状态[0]仅由线程[1]使用等等可能会消除虚假共

每个电阻器都有一组属性，各线程会对它们执行“读取-修改-写回”（read-modify-write）操作。理想情况下，这些属性可以当作线程私有变量来处理。但是，由于旧代码库施加的一些限制，我只能选择以下三种方法之一：

方法0：在每个电阻器结构中声明“线程私有”变量的数组。线程0使用 mean_tau[0]、offset[0]……，线程1使用 mean_tau[1]、offset[1]……，依此类推。据我所知，这不是一个好方法，会导致大量伪共享（false sharing）。方法1：在电阻器的每个实例中放置一个 th_private 结构体的一维数组，其中 state[0] 仅由线程0使用，state[1] 仅由线程1使用，依此类推。这样可能消除伪共享，但如何确保它按缓存行对齐？方法2：声明一个全局二维矩阵，其中每一行属于一个线程，每一列属于一个电阻器，代码中有更多细节。

现在，我既想避免伪共享（false sharing），又想做到缓存行对齐，哪种方法最好？

嗯，代码太多了。为了让你的场景更容易理解，能否把它简化一下？关于结构体大小：编译器可能已经把每个非 double 成员对齐到4字节边界，以简化访问。关于缓存对齐：最坏的情况下，这只意味着每个结构体比预期多占用一条缓存行。这要紧吗？谢谢 Oli 的快速回复。我修改了代码，删掉了你刚才已经回答的最后一个子问题。顺便说一句，你的回答很有道理。这确实要紧，因为对每个线程、每个电阻器，我都会有将近一百万份这样的数据。现在好懂多了，谢谢！是的，我同意，一百万份确实不是小数目……

sizeof(th_private): 8
Approach 1: 0x658080 0x658088 0x658090 0x658098
Approach 2: 0x608000 0x608008 0x608010 0x608018

[parallel processing]相关文章推荐

随机文章推荐