C 矩阵计算的性能测试
我试图计算所有在 Z(N) 中可逆的 3x3 矩阵,其中 N 为 5 和 13。基本上,我计算矩阵行列式模 N 的值,如果结果与 N 互素,则该矩阵可逆。我也需要对 N 等于 26 和 256 的情况执行此操作,但要找到它们需要很长时间。因此,当我发现生成 N = 26 的所有矩阵需要花费太多时间时,我决定通过剔除所有行列式为 0 的矩阵来减少待测试矩阵的数量。所以我开始实现算法的剪枝部分。有趣的是,加上剪枝之后,得到结果所需的时间反而更长了。为了找出性能下降的原因,我尝试了 Visual Studio 的 CPU 性能分析和 AMD uProf 来寻找热点区域。这没有帮助。最后,我尝试从 case 8 开始,逐一注释掉 switch 内部的优化。下面是这些测试的结果,如您所见,当测试的数量(即计算的完整矩阵和行列式的数量)增加时,所需的时间反而减少了。C 矩阵计算的性能测试,c,algorithm,performance,C,Algorithm,Performance,我试图计算所有在 Z(N) 中可逆的 3x3 矩阵,其中 N 为 5 和 13。基本上,我计算矩阵行列式模 N 的值,如果结果与 N 互素,则该矩阵可逆。我也需要对 N 等于 26 和 256 的情况执行此操作,但要找到它们需要很长时间。因此,当我发现生成 N = 26 的所有矩阵需要花费太多时间时,我决定通过剔除所有行列式为 0 的矩阵来减少待测试矩阵的数量。所以我开始实现算法的剪枝部分。有趣的是,加上剪枝之后,得到结果所需的时间反而更长了。为了找出性能下降的原因,我尝试了 Visual Studio 的 CPU 性能分析和 AMD uProf 来寻找热点区域。这
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1832992
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10562854484
time: 83s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1865344
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10575805824
time: 81s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1865344
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10575805824
time: 81s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1893280
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10585310112
time: 80s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1907600
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10590078096
time: 80s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1922000
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10594847952
time: 78s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1937500
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10599672564
time: 78s
D:\hill_cipher\x64\Release>hill_cipher.exe
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1953125
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10604499373
time: 74s
我怎么也想不明白,为什么加了优化之后反而需要更长的时间。有什么想法吗?
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
/*
 * Greatest common divisor of a and b, computed with Euclid's algorithm.
 *
 * Returns a when b == 0 (so gcd(x, 0) == x and gcd(0, 0) == 0). The previous
 * recursive form evaluated a % b unconditionally, which divides by zero for
 * b == 0. For every b != 0 the result is unchanged: the last non-zero divisor
 * of the remainder chain, e.g. gcd(0, n) == n.
 *
 * The sign of the result follows the operands; no absolute value is taken.
 */
int gcd(int a, int b) {
    while (b != 0) {
        int remainder = a % b;
        a = b;
        b = remainder;
    }
    return a;
}
// Determinant of a 2x2 matrix.
/* The matrix is stored column by column, so the indices map to
   0 2
   1 3
*/
int det2(int *mat) {
    const int main_diagonal = mat[0] * mat[3];
    const int anti_diagonal = mat[1] * mat[2];

    return main_diagonal - anti_diagonal;
}
// Determinant of a 3x3 matrix (rule of Sarrus).
/* The matrix is stored column by column, so the indices map to
   0 3 6
   1 4 7
   2 5 8
*/
int det3(int* mat) {
    // Three "down-right" diagonal products are added ...
    int total = mat[0] * mat[4] * mat[8];
    total += mat[1] * mat[5] * mat[6];
    total += mat[2] * mat[3] * mat[7];

    // ... and the three "up-right" diagonal products are subtracted.
    total -= mat[6] * mat[4] * mat[2];
    total -= mat[7] * mat[5] * mat[0];
    total -= mat[8] * mat[3] * mat[1];

    return total;
}
/*
 * Prints a column-major 3x3 matrix row by row, tagging the first row with
 * "autokey".
 *
 * Expands to a single printf call expression with no trailing semicolon, so
 * `PRINT3MAT(m);` is exactly one statement and is safe in an unbraced
 * if/else. The argument is parenthesized so pointer expressions such as
 * `base + 9` index correctly. The argument is evaluated several times; do not
 * pass expressions with side effects.
 */
#define PRINT3MAT(matrix) printf("\t%d %d %d\tautokey\n\t%d %d %d\n\t%d %d %d\n\n", \
    (matrix)[0], (matrix)[3], (matrix)[6], \
    (matrix)[1], (matrix)[4], (matrix)[7], \
    (matrix)[2], (matrix)[5], (matrix)[8])
/*
 * Prints a column-major 2x2 matrix row by row, tagging the first row with
 * "autokey".
 *
 * Expands to a single printf call expression with no trailing semicolon, so
 * `PRINT2MAT(m);` is exactly one statement and is safe in an unbraced
 * if/else. The argument is parenthesized and evaluated several times; do not
 * pass expressions with side effects.
 */
#define PRINT2MAT(matrix) printf("\t%d %d\tautokey\n\t%d %d\n\n", \
    (matrix)[0], (matrix)[2], \
    (matrix)[1], (matrix)[3])
/*
 * Tells whether a 2x2 matrix is its own inverse: returns true when the
 * product of the matrix with itself is the identity matrix.
 *
 * The products are compared against 0 and 1 in plain integer arithmetic;
 * no modular reduction is applied here.
 *
 * Storage is column by column:
 *   0 2
 *   1 3
 */
bool autokey2(int* matrix) {
    const int r0c0 = matrix[0];
    const int r1c0 = matrix[1];
    const int r0c1 = matrix[2];
    const int r1c1 = matrix[3];

    // Entries of M*M, checked in row-major order; evaluation stops at the
    // first entry that differs from the identity.
    return (r0c0 * r0c0 + r0c1 * r1c0 == 1)     // M(0,0)
        && (r0c0 * r0c1 + r0c1 * r1c1 == 0)     // M(0,1)
        && (r1c0 * r0c0 + r1c1 * r1c0 == 0)     // M(1,0)
        && (r1c0 * r0c1 + r1c1 * r1c1 == 1);    // M(1,1)
}
/*
 * Tells whether a 3x3 matrix is its own inverse: returns true when the
 * product of the matrix with itself is the identity matrix.
 *
 * The products are compared against 0 and 1 in plain integer arithmetic;
 * no modular reduction is applied here.
 *
 * Storage is column by column, so element (row, col) lives at row + 3 * col:
 *   0 3 6
 *   1 4 7
 *   2 5 8
 */
bool autokey3(int* matrix) {
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) {
            // Entry (row, col) of M*M: row `row` of M times column `col` of M.
            const int entry = matrix[row] * matrix[3 * col]
                            + matrix[row + 3] * matrix[3 * col + 1]
                            + matrix[row + 6] * matrix[3 * col + 2];
            const int wanted = (row == col) ? 1 : 0;

            if (entry != wanted) {
                return false;   // first mismatch with the identity decides
            }
        }
    }
    return true;
}
/*
 * Brute-force census of d x d matrices with entries in 0 .. N-1.
 *
 * For each modulus N under test, every matrix is enumerated with an
 * odometer-style backtracking loop over the column-major `matrix` array.
 * A complete matrix counts as a "possible key" when its determinant, reduced
 * to the range 0 .. N-1, is coprime with N (looked up in `key_test`). Among
 * those, matrices accepted by autokey3()/autokey2() are counted as "autokey".
 * `nr_teste` counts the complete matrices whose determinant was evaluated.
 *
 * For d == 3 some partial matrices are skipped early ("pruning") when a
 * column or row filled so far is all zero, or when the first two columns /
 * first two rows are integer multiples of each other. In all these cases the
 * determinant is exactly 0, so no invertible matrix is lost.
 *
 * One line per (d, N) is printed, then the total wall-clock time in seconds.
 * Always returns 0.
 */
int main(void)
{
    bool key_test[4][256];   // key_test[i][v] is true when gcd(v, N[i]) == 1
    int N[] = { 5, 13, 26, 256 };
    long long int possible_keys = 0;
    long long int autokey = 0;
    int matrix[9];           // current candidate, column-major
    bool valid[9];           // valid[p]: position p already holds a value of the current sweep
    int pos;                 // position currently being advanced; < 0 ends the enumeration
    int val;
    int cst;
    int max_pos, max_pn;
    time_t start_t, end_t;
    long long int nr_teste;

    // generate all invertible values
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < N[i]; j++) {
            if (gcd(j, N[i]) == 1) {
                key_test[i][j] = true;
            } else {
                key_test[i][j] = false;
            }
        }
    }

    start_t = time(NULL);
    for (int d = 3; d <= 3; d++) {
        max_pos = d * d - 1;
        // 2x2 matrices are checked for all four moduli, 3x3 only for the first two
        if (d == 2) {
            max_pn = 4;
        } else {
            max_pn = 2;
        }

        // check all the Z(n) rings
        for (int pn = 0; pn < max_pn; pn++) {
            // init matrix
            for (int i = 0; i < d * d; i++) {
                matrix[i] = 0;
                valid[i] = false;
            }
            possible_keys = 0;
            autokey = 0;
            nr_teste = 0;

            // generate matrix
            pos = 0;
            while (pos >= 0) {
                if (valid[pos] == false) {
                    // first visit of this position in the current sweep: start at 0
                    matrix[pos] = 0;
                    valid[pos] = true;
                } else {
                    if (matrix[pos] + 1 < N[pn]) {
                        matrix[pos]++;
                    } else {
                        // No more values for this position: back up one position
                        valid[pos] = false;
                        //matrix[pos] = 0;
                        pos--;
                        continue;
                    }
                }

                // Pruning. Each `continue` skips the current value of matrix[pos]
                // (and therefore every matrix that would extend it).
                // Original author's note: "For some reason if I add this, it will be slower".
                if (d == 3) {
                    /* matrix column based
                       0 3 6
                       1 4 7
                       2 5 8
                    */
                    switch (pos)
                    {
                    case 2: {
                        // column 0 is all zero
                        if (matrix[2] == 0 && matrix[1] == 0 && matrix[0] == 0) {
                            continue;
                        }
                        break;
                    }
                    case 5: {
                        // column 1 is all zero
                        if (matrix[5] == 0 && matrix[4] == 0 && matrix[3] == 0) {
                            continue;
                        }
                        if (matrix[5] <= matrix[2] && matrix[5] != 0) {
                            // column 0 == cst * column 1
                            cst = matrix[2] / matrix[5];
                            if ((matrix[2] % matrix[5] == 0) && (cst * matrix[4] == matrix[1]) && (cst * matrix[3] == matrix[0])) {
                                continue;
                            }
                        } else {
                            if (matrix[2] != 0) {
                                // column 1 == cst * column 0
                                cst = matrix[5] / matrix[2];
                                if ((matrix[5] % matrix[2] == 0) && (cst * matrix[1] == matrix[4]) && (cst * matrix[0] == matrix[3])) {
                                    continue;
                                }
                            }
                        }
                        break;
                    }
                    case 6: {
                        // row 0 is all zero
                        if (matrix[6] == 0 && matrix[3] == 0 && matrix[0] == 0) {
                            continue;
                        }
                        break;
                    }
                    case 7: {
                        // row 1 is all zero
                        if (matrix[7] == 0 && matrix[4] == 0 && matrix[1] == 0) {
                            continue;
                        }
                        if (matrix[7] <= matrix[6] && matrix[7] != 0) {
                            // row 0 == cst * row 1
                            cst = matrix[6] / matrix[7];
                            if ((matrix[6] % matrix[7] == 0) && (cst * matrix[4] == matrix[3]) && (cst * matrix[1] == matrix[0])) {
                                continue;
                            }
                        } else {
                            if (matrix[6] != 0) {
                                // row 1 == cst * row 0
                                cst = matrix[7] / matrix[6];
                                if ((matrix[7] % matrix[6] == 0) && (cst * matrix[3] == matrix[4]) && (cst * matrix[0] == matrix[1])) {
                                    continue;
                                }
                            }
                        }
                        break;
                    }
                    case 8: {
                        // column 2 is all zero
                        if (matrix[8] == 0 && matrix[7] == 0 && matrix[6] == 0) {
                            continue;
                        }
                        // row 2 is all zero
                        if (matrix[8] == 0 && matrix[5] == 0 && matrix[2] == 0) {
                            continue;
                        }
                        // does not cut much and is rare
                        break;
                    }
                    default:
                        // no pruning rule for the other positions
                        break;
                    }
                }
                //else {
                //    switch (pos)
                //    {
                //    case 1: {
                //        /* matrix column based
                //        0 2
                //        1 3
                //        */
                //        if (matrix[1] == 0 && matrix[0] == 0) {
                //            continue;
                //        }
                //        break;
                //    }
                //    case 2: {
                //        /* matrix column based
                //        0 2
                //        1 3
                //        */
                //        if (matrix[0] == 0 && matrix[2] == 0) {
                //            continue;
                //        }
                //        break;
                //    }
                //    case 3: {
                //        /* matrix column based
                //        0 2
                //        1 3
                //        */
                //        if (matrix[1] == 0 && matrix[3] == 0) {
                //            continue;
                //        }
                //        if (matrix[2] == 0 && matrix[3] == 0) {
                //            continue;
                //        }
                //        // does not cut much and is rare
                //        break;
                //    }
                //    }
                //}

                if (pos != max_pos) {
                    // partial matrix accepted: move on to the next position
                    if (valid[pos] == true) {
                        pos++;
                    }
                } else {
                    // we have a complete matrix
                    nr_teste++;
                    if (d == 3) {
                        /* matrix column based
                           0 3 6
                           1 4 7
                           2 5 8
                        */
                        val = matrix[0] * matrix[4] * matrix[8]
                            + matrix[1] * matrix[5] * matrix[6]
                            + matrix[2] * matrix[3] * matrix[7]
                            - matrix[6] * matrix[4] * matrix[2]
                            - matrix[7] * matrix[5] * matrix[0]
                            - matrix[8] * matrix[3] * matrix[1];
                    } else {
                        /* matrix column based
                           0 2
                           1 3
                        */
                        val = matrix[0] * matrix[3] - matrix[1] * matrix[2];
                    }
                    val = val % N[pn];
                    if (val < 0) {
                        val += N[pn]; // make sure the result is positive
                    }
                    if (key_test[pn][val]) { // val == 0 will return false
                        possible_keys++;
                        if (d == 3) {
                            if (autokey3(matrix)) {
                                autokey++;
                                //PRINT3MAT(matrix);
                            }
                        } else {
                            if (autokey2(matrix)) {
                                autokey++;
                                //PRINT2MAT(matrix);
                            }
                        }
                    }
                }
            }
            printf("d: %d N: %3d possible keys: %10lld autokey: %lld nr teste: %11lld\n", d, N[pn], possible_keys, autokey, nr_teste);
            //break;
        }
        //break;
    }
    end_t = time(NULL);
    // time_t is not guaranteed to be long long; cast so the argument matches %lld.
    printf("time: %llds\n", (long long)(end_t - start_t));
    return 0;
}
启用剪枝时(nr teste 小于 5^9 = 1953125,说明剪枝已生效),我得到了下面第一组结果;其后 nr teste 等于 5^9 和 13^9 的第二组结果对应不使用剪枝的情况:
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1832992
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10562854484
time: 91s
Performance counter stats for './hill_cipher':
91.141,55 msec task-clock # 0,999 CPUs utilized
368.707.285.660 cycles # 4,045 GHz (83,33%)
3.535 context-switches # 0,039 K/sec
52 page-faults # 0,001 K/sec
29.044.482 cache-misses (83,33%)
368.706.753.424 cycles # 4,045 GHz (66,66%)
861.658.443.048 instructions # 2,34 insn per cycle (83,33%)
94.135.383.666 branches # 1032,848 M/sec (83,33%)
1.571.177.575 branch-misses # 1,67% of all branches (83,34%)
91,219342601 seconds time elapsed
91,079478000 seconds user
0,063996000 seconds sys
d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1953125
d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10604499373
time: 78s
Performance counter stats for './hill_cipher':
78.072,56 msec task-clock # 1,000 CPUs utilized
315.655.551.124 cycles # 4,043 GHz (83,33%)
274 context-switches # 0,004 K/sec
52 page-faults # 0,001 K/sec
21.543.133 cache-misses (83,33%)
315.623.715.271 cycles # 4,043 GHz (66,67%)
761.423.725.679 instructions # 2,41 insn per cycle (83,33%)
57.487.582.295 branches # 736,335 M/sec (83,33%)
418.093.257 branch-misses # 0,73% of all branches (83,33%)
78,077203434 seconds time elapsed
78,066022000 seconds user
0,007999000 seconds sys
编译器命令是:
gcc -g -Wall -Wextra -Wshadow -Wwrite-strings -m64 -O3 -flto -fPIC -Iinclude -c -o bin/release_64/obj/hill_cipher.o src/hill_cipher.cpp
gcc -g -m64 -O3 -flto -fno-omit-frame-pointer -Iinclude -o bin/release_64/hill_cipher bin/release_64/obj/hill_cipher.o
您是否尝试过使用性能分析器运行这些不同的版本?我推测“聪明”的版本会因为分支开销而变慢。我可以重现你观察到的总体现象:
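d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1832992; d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10562854484; time: 132s;; d: 3 N: 5 possible keys: 1488000 autokey: 4 nr teste: 1953125; d: 3 N: 13 possible keys: 9726417792 autokey: 4 nr teste: 10604499373; time: 109s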
我没有测量分支预测失败的次数。您的编译器标志是什么?您能否测量每个剪枝测试节省了多少操作?您也许还可以通过使用位运算简化测试来加快速度,尽管这可能需要不同的矩阵表示形式。我没有更改编译器标志,它们是 Visual Studio Release 配置的默认设置。我检查过,已设置 /O2 标志,并选择了“代码速度优先”(Favor Speed)。但在所有测试中,编译标志都是相同的。@Tandura 这可能不仅仅是分支预测失误。请注意,在较慢的情况下执行的指令总数也明显更高,这可能是时间增加的主要原因。这可能是由于 (1) 执行了额外的比较指令和分支指令;(2) 分支干扰了某些编译器优化,从 CSE(公共子表达式消除)到指令调度(例如,提前调度加载指令)。请查看生成的机器代码:静态指令数的增加可能已经暗示了动态指令数的增加。