C：使用查找表在有限小整数值集上计算函数的最快方法？_C

C：使用查找表在有限小整数值集上计算函数的最快方法？

C：使用查找表在有限小整数值集上计算函数的最快方法？,c,C,我目前正在从事一个项目，我想通过调用C来优化Python中的一些数值计算。简而言之，我需要为一个巨大数组中的每个元素计算y[i]=f（x[i]）（通常有10^9个或更多条目）。这里，x[i]是一个介于-10和10之间的整数，f是一个接受x[i]并返回双精度浮点数的函数。我的问题是，f以数值稳定的方式进行计算需要很长的时间。为了加快速度，我想将所有2*10+1个可能的f（x[i]）值硬编码到常量数组中，例如：double table_of_values[]={f（-10），..，f（10）} 然后使用“查找表”方法对f进行求值，如下所

我目前正在从事一个项目，我想通过调用C来优化Python中的一些数值计算

简而言之，我需要为一个巨大数组中的每个元素计算

y[i] = f(x[i])

（通常有

10^9

个或更多条目）。这里，

x[i]

是一个介于-10和10之间的整数，

f 是一个接受

x[i]

并返回双精度浮点数（double）的函数。我的问题是

f 以数值稳定的方式进行计算需要很长的时间。
为了加快速度，我想将所有
2*10+1
可能的
f(x[i])
值硬编码到常量数组中，例如：

double table_of_values[] = {f(-10), ..., f(10)} 然后使用“查找表”方法对f 求值，如下所示： for (i = 0; i < N; i++) { y[i] = table_of_values[x[i] + 11]; //instead of y[i] = f(x[i]) }
由于我不太擅长用C编写优化代码，我想知道：1）具体地说，由于x 非常大，我想知道在执行该循环时是否值得做进一步优化（例如，预先对x 排序，或者找到一种处理负索引的巧妙方法，而不只是写成[x[i]+10+1] ）。2）假设x[i] 不是介于-10和10之间，而是介于-20和20之间。在这种情况下，我仍然可以使用相同的方法，但需要手动硬编码查找表。是否有方法在代码中动态生成查找表，以便我使用相同的方法并允许x[i] 属于可变范围？ 您的数组中可以有负索引。（我不确定这是否在规范中。）如果您有以下代码： int arr[] = {1, 2 ,3, 4, 5}; int* lookupTable = arr + 3; printf("%i", lookupTable[-2]); 它将打印出2 。这是因为在C中，下标运算是用指针运算定义的（a[i] 等价于 *(a + i)）。只要指针不指向数组的开头，就可以访问指针之前的元素。请记住，如果arr 的内存是用malloc（）分配的，则不能用free（lookupTable）来释放它，必须把malloc（）返回的原始指针传给free（）。 生成具有动态范围值的表相当容易。下面是一个简单的单表方法： #include <malloc.h> #define VARIABLE_USED(_sym) \ do { \ if (1) \ break; \ if (!! _sym) \ break; \ } while (0) double *table_of_values; int table_bias; // use the smallest of these that can contain the values the x array may have #if 0 typedef int xval_t; #endif #if 0 typedef short xval_t; #endif #if 1 typedef char xval_t; #endif #define XLEN (1 << 9) xval_t *x; // fslow -- your original function double fslow(int i) { return 1; // whatever } // ftablegen -- generate variable table void ftablegen(double (*f)(int),int lo,int hi) { int len; table_bias = -lo; len = hi - lo; len += 1; // NOTE: you can do free(table_of_values) when no longer needed table_of_values = malloc(sizeof(double) * len); for (int i = lo; i <= hi; ++i) table_of_values[i + table_bias] = f(i); } // fcached -- retrieve cached table data double fcached(int i) { return table_of_values[i + table_bias]; } // fripper -- access x and table arrays void fripper(xval_t *x) { double *tptr; int bias; double val; // ensure these go into registers to prevent needless extra memory fetches tptr = table_of_values; bias = table_bias; for (int i = 0; i < XLEN; ++i) { val = tptr[x[i] + bias]; // do stuff with val VARIABLE_USED(val); } } int main(void) { ftablegen(fslow,-10,10); x = malloc(sizeof(xval_t) * XLEN); fripper(x); return 0; } fripper 是最快的，因为它避免了在每次循环迭代中重新读取全局变量table_of_values 和table_bias 。在fripper中，编译器优化器将确保它们保留在寄存器中。请参阅我的答案：为什么…… 然而，我编写了一个使用fcached 的fripper 
变体，反汇编的代码是相同的[而且是最优的].所以，我们可以忽略这一点…或者，我们可以吗？有时候，反汇编代码是一个很好的交叉检查，也是唯一可以确定的方法。在创建完全优化的C代码时，这只是一个额外的项目。在代码生成方面，可以给编译器很多选择，所以有时候只是尝试和错误因为基准测试很重要，我加入了我的时间戳例程（仅供参考，[AFAIK]底层的clock\u gettime 调用是python的time.clock（）的基础）以下是最新版本： #include <malloc.h> #include <time.h> typedef long long s64; #define SUPER_INLINE \ __attribute__((__always_inline__)) static inline #define VARIABLE_USED(_sym) \ do { \ if (1) \ break; \ if (!! _sym) \ break; \ } while (0) #define TVSEC 1000000000LL // nanoseconds in a second #define TVSECF 1e9 // nanoseconds in a second // tvget -- get high resolution time of day // RETURNS: absolute nanoseconds s64 tvget(void) { struct timespec ts; s64 nsec; clock_gettime(CLOCK_REALTIME,&ts); nsec = ts.tv_sec; nsec *= TVSEC; nsec += ts.tv_nsec; return nsec; ) // tvgetf -- get high resolution time of day // RETURNS: fractional seconds double tvgetf(void) { struct timespec ts; double sec; clock_gettime(CLOCK_REALTIME,&ts); sec = ts.tv_nsec; sec /= TVSECF; sec += ts.tv_sec; return sec; ) double *table_of_values; int table_bias; double *dummyptr; // use the smallest of these that can contain the values the x array may have #if 0 typedef int xval_t; #endif #if 0 typedef short xval_t; #endif #if 1 typedef char xval_t; #endif #define XLEN (1 << 9) xval_t *x; // fslow -- your original function double fslow(int i) { return 1; // whatever } // ftablegen -- generate variable table void ftablegen(double (*f)(int),int lo,int hi) { int len; table_bias = -lo; len = hi - lo; len += 1; // NOTE: you can do free(table_of_values) when no longer needed table_of_values = malloc(sizeof(double) * len); for (int i = lo; i <= hi; ++i) table_of_values[i + table_bias] = f(i); } // fcached -- retrieve cached table data SUPER_INLINE double fcached(int i) { return table_of_values[i + table_bias]; } // fripper_fcached -- access x and table arrays void fripper_fcached(xval_t *x) { double val; double *dptr; dptr = dummyptr; for (int i = 0; i < XLEN; ++i) { val = fcached(x[i]); // do stuff with val 
dptr[i] = val; } } // fripper -- access x and table arrays void fripper(xval_t *x) { double *tptr; int bias; double val; double *dptr; // ensure these go into registers to prevent needless extra memory fetches tptr = table_of_values; bias = table_bias; dptr = dummyptr; for (int i = 0; i < XLEN; ++i) { val = tptr[x[i] + bias]; // do stuff with val dptr[i] = val; } } int main(void) { ftablegen(fslow,-10,10); x = malloc(sizeof(xval_t) * XLEN); dummyptr = malloc(sizeof(double) * XLEN); fripper(x); fripper_fcached(x); return 0; } #包括 #包括 typedef long-long s64； #定义SUPER_内联\ __属性uuu（（uuu始终u内联uuu））静态内联 #定义使用的变量（符号）\ 做{\ 如果（1）\ 中断\ 如果（！！_-sym）\ 中断\ }而（0） #每秒定义TVSEC 1000000000LL//纳秒 #定义TVSECF 1e9//纳秒秒 //tvget--获取一天中的高分辨率时间 //返回：绝对纳秒 s64 tvget（无效） { 结构timespects； s64 nsec；时钟获取时间（时钟实时，&ts）； nsec=ts.tv_秒； nsec*=TVSEC； nsec+=ts.tv\U nsec；返回nsec； ) //tvgetf--获取一天中的高分辨率时间 //返回：小数秒双重的 tvgetf（无效） { 结构timespects；双秒；时钟获取时间（时钟实时，&ts）； sec=ts.tv\u nsec； sec/=TVSECF；秒+=ts.tv_秒；返回秒； ) _值的双*表_； int表_偏差；双*dummyptr； //使用其中最小的一个，它可以包含x数组可能具有的值 #如果0 typedef int xval\t； #恩迪夫 #如果0 typedef short xval\t； #恩迪夫 #如果1 typedef char xval\t； #恩迪夫 #定义XLEN（1我真的认为Craig Estey以自动方式构建表格是正确的。我只想为查找表格添加一个注释如果您知道您将在Haswell机器上运行代码（使用AVX2），那么您应该确保您的代码使用VGATHERDPD ，您可以使用\u mm256\u i32Gatherd\u pd 固有功能来使用它。如果您这样做，您的表查找将运行！（您甚至可以使用cpuid（）动态检测AVX2，但这是另一回事）编辑：让我用一些代码来详细说明： #include <stdint.h> #include <stdio.h> #include <immintrin.h> /* I'm not sure if you need the alignment */ double table[8] __attribute__((aligned(16)))= { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 }; int main() { int32_t i[4] = { 0,2,4,6 }; __m128i index = _mm_load_si128( (__m128i*) i ); __m256d result = _mm256_i32gather_pd( table, index, 8 ); double* f = (double*)&result; printf("%f %f %f %f\n", f[0], f[1], f[2], f[3]); return 0; } 这太快了！数组不能有负索引。只需标准化为arr[可能是负索引+20] Ad 1）x 的大小与查找表的大小有什么关系？x将使用千兆字节的内存，而查找表只有100字节左右。Ad 2）是-您可以编写python脚本来生成C源代码（；-）)，或者，malloc 
查找表并在-20..20上循环，调用f。一个小建议：为什么不先在python中实现查找表，看看它是否在速度上有所不同。更可能的是数据集的大小（10亿倍，使用4或8Gb的ram）这就是问题所在。你有多少内存？LUT方法最终会在精度和表大小之间进行权衡。这是常见的做法 #include <malloc.h> #include <time.h> typedef long long s64; #define SUPER_INLINE \ __attribute__((__always_inline__)) static inline #define VARIABLE_USED(_sym) \ do { \ if (1) \ break; \ if (!! _sym) \ break; \ } while (0) #define TVSEC 1000000000LL // nanoseconds in a second #define TVSECF 1e9 // nanoseconds in a second // tvget -- get high resolution time of day // RETURNS: absolute nanoseconds s64 tvget(void) { struct timespec ts; s64 nsec; clock_gettime(CLOCK_REALTIME,&ts); nsec = ts.tv_sec; nsec *= TVSEC; nsec += ts.tv_nsec; return nsec; ) // tvgetf -- get high resolution time of day // RETURNS: fractional seconds double tvgetf(void) { struct timespec ts; double sec; clock_gettime(CLOCK_REALTIME,&ts); sec = ts.tv_nsec; sec /= TVSECF; sec += ts.tv_sec; return sec; ) double *table_of_values; int table_bias; double *dummyptr; // use the smallest of these that can contain the values the x array may have #if 0 typedef int xval_t; #endif #if 0 typedef short xval_t; #endif #if 1 typedef char xval_t; #endif #define XLEN (1 << 9) xval_t *x; // fslow -- your original function double fslow(int i) { return 1; // whatever } // ftablegen -- generate variable table void ftablegen(double (*f)(int),int lo,int hi) { int len; table_bias = -lo; len = hi - lo; len += 1; // NOTE: you can do free(table_of_values) when no longer needed table_of_values = malloc(sizeof(double) * len); for (int i = lo; i <= hi; ++i) table_of_values[i + table_bias] = f(i); } // fcached -- retrieve cached table data SUPER_INLINE double fcached(int i) { return table_of_values[i + table_bias]; } // fripper_fcached -- access x and table arrays void fripper_fcached(xval_t *x) { double val; double *dptr; dptr = dummyptr; for (int i = 0; i < XLEN; ++i) { val = fcached(x[i]); // do stuff with val dptr[i] = val; } } // fripper -- access x and table arrays void fripper(xval_t *x) { 
double *tptr; int bias; double val; double *dptr; // ensure these go into registers to prevent needless extra memory fetches tptr = table_of_values; bias = table_bias; dptr = dummyptr; for (int i = 0; i < XLEN; ++i) { val = tptr[x[i] + bias]; // do stuff with val dptr[i] = val; } } int main(void) { ftablegen(fslow,-10,10); x = malloc(sizeof(xval_t) * XLEN); dummyptr = malloc(sizeof(double) * XLEN); fripper(x); fripper_fcached(x); return 0; } #include <stdint.h> #include <stdio.h> #include <immintrin.h> /* I'm not sure if you need the alignment */ double table[8] __attribute__((aligned(16)))= { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 }; int main() { int32_t i[4] = { 0,2,4,6 }; __m128i index = _mm_load_si128( (__m128i*) i ); __m256d result = _mm256_i32gather_pd( table, index, 8 ); double* f = (double*)&result; printf("%f %f %f %f\n", f[0], f[1], f[2], f[3]); return 0; } $ gcc --std=gnu99 -mavx2 gathertest.c -o gathertest && ./gathertest 0.100000 0.300000 0.500000 0.700000