C# C语言中的数学优化#_C#_Optimization_Neural Network_Performance

C# 中的数学优化

c# optimization neural-network performance

C# C语言中的数学优化#,c#,optimization,neural-network,performance,C#,Optimization,Neural Network,Performance,我整天都在分析一个应用程序，在优化了几段代码之后，我的待办事项列表上就剩下这个了。这是一个神经网络的激活函数，它被调用超过1亿次。根据dotTrace，它大约占整个功能时间的60% 您将如何对此进行优化 public static float Sigmoid(double value) { return (float) (1.0 / (1.0 + Math.Pow(Math.E, -value))); } 想法：也许您可以使用预先计算的值制作一个（大）查找表？试试： public st

我整天都在分析一个应用程序，在优化了几段代码之后，我的待办事项列表上就剩下这个了。这是一个神经网络的激活函数，它被调用超过1亿次。根据dotTrace，它大约占整个功能时间的60%

您将如何对此进行优化

// Logistic sigmoid 1 / (1 + e^-value): evaluated in double precision,
// then narrowed to float for the caller.
public static float Sigmoid(double value) {
    double decay = Math.Pow(Math.E, -value);
    double result = 1.0 / (1.0 + decay);
    return (float) result;
}

想法：也许您可以使用预先计算的值制作一个（大）查找表？

试试：

// Logistic sigmoid 1 / (1 + e^-value). The exponential is taken with Math.Exp
// and narrowed to float before the single-precision division.
public static float Sigmoid(double value) {
    float decay = (float) Math.Exp(-value);
    float denominator = 1.0f + decay;
    return 1.0f / denominator;
}

编辑：我做了一个快速基准测试。在我的机器上，上面的代码比你的方法快43%，这个数学上等价的代码是最快的（比原始代码快46%）：

编辑2：我不确定 C# 函数的开销有多大，但如果在源代码中加入 #include <math.h>，应该就可以使用下面这个版本，它调用的是单精度（float）的指数函数 expf。可能会快一点

// Computes k / (1 + k) with k = e^value in single precision, which is
// algebraically the same as 1 / (1 + e^-value).
// NOTE(review): expf is the C <math.h> float exponential, not a .NET API; this
// snippet pairs a C#-style signature with a C call, so treat it as pseudo-code.
public static float Sigmoid(double value) {
    float k = expf((float) value);
    // NOTE(review): if expf overflows to +infinity for a large positive value,
    // k / (1.0f + k) becomes inf / inf = NaN -- confirm inputs are bounded.
    return k / (1.0f + k);
}

此外，如果您正在进行数百万次调用，那么函数调用开销可能是一个问题。尝试创建一个内联函数，看看是否有帮助。

在1亿次调用中，我开始怀疑分析器开销是否会影响结果。将计算替换为无操作，然后查看是否仍报告它消耗了60%的执行时间

或者更好的方法是，创建一些测试数据，并使用秒表计时器分析大约一百万个调用

在谷歌搜索中，我找到了Sigmoid函数的替代实现

// Bipolar sigmoid: 2 / (1 + e^(-2x)) - 1.
// Unlike the logistic function, the result lies in [-1, 1] and is 0 at x = 0.
public double Sigmoid(double x)
{
   double decay = Math.Exp(-2 * x);
   double scaled = 2 / (1 + decay);
   return scaled - 1;
}

这符合你的需要吗？速度快吗

如果是针对激活函数，那么如果e^x的计算完全准确，这是否非常重要

例如，如果使用近似值 (1 + x/256)^256，在我的奔腾机器上用 Java 测试（我假设 C# 基本上会编译成相同的处理器指令），它比 e^x（Math.exp()）快 7-8 倍；当 x 在 ±1.5 以内时精确到小数点后 2 位，并且在你所说的范围内数量级也是正确的。（显然，要计算 256 次方，实际上只需把这个数连续平方 8 次——不要用 Math.Pow 来做！）在 Java 中：

根据您希望的近似精度，将256倍或减半（以及添加/删除乘法）。即使n=4，对于-0.5和0.5之间的x值，它仍然给出了大约1.5个小数位的精度（看起来比Math.exp（）快15倍）

顺便说一下，我忘了提到：你显然不应该真的去除以 256，而应乘以常数 1/256。Java 的 JIT 编译器会自动进行这种优化（至少 Hotspot 是这样），我假设 C# 也会这样做。

首先想到的是：关于values变量的一些统计数据如何

“value”的值是否通常很小，即 -10 ≤ value ≤ 10？（使用性能测量更新）（使用实际结果再次更新：）

我认为，在性能方面，查找表解决方案可以让您走得更远，而内存和精度方面的成本可以忽略不计

下面的代码片段是用 C 语言写的一个示例实现（我的 C# 不够熟练，无法直接用它凭空写出代码）。它运行正常、性能也很好，但我确信其中有 bug :)

待办事项：

有需要改进的地方和消除弱点的方法；如何做是留给读者作为练习：）

调整函数的范围，以避免表开始和结束的跳转
添加一个轻微的噪波函数以隐藏锯齿瑕疵
正如Rex所说，插值可以使您获得更高的精度，同时具有更便宜的性能

// Logistic sigmoid written as e^value / (1 + e^value), which is algebraically
// equal to 1 / (1 + e^-value) but avoids negating the argument.
//
// value:   the pre-activation input.
// returns: the sigmoid of value as a float in [0, 1].
public static float Sigmoid(double value) 
{
    // Math.Exp returns double; C# requires an explicit narrowing cast to float.
    float k = (float) Math.Exp(value);

    // Once e^value exceeds the float range, k is +infinity and k / (1 + k)
    // would be inf / inf = NaN. The true limit there is 1.
    if (float.IsPositiveInfinity(k))
    {
        return 1.0f;
    }

    return k / (1.0f + k);
}

f(x) = (3x - x**3)/2

f(x) = x*(3 - x*x)/2

请记住，对激活函数的任何更改都会以行为不同为代价。这甚至包括切换到 float（从而降低精度）或使用其他激活函数作为替代。只有针对您的用例进行实验，才能找到正确的方法

除了简单的代码优化之外，我还建议考虑计算的并行化（即：利用多个核心）
// Memoization sketch: if this exact input is already a key in sigmoidCache,
// return the stored result instead of recomputing it.
// NOTE(review): sigmoidCache is not defined in this snippet -- presumably a map
// from input value to sigmoid result; this does two lookups per hit.
if(sigmoidCache.containsKey(value)) return sigmoidCache.get(value);

#include <math.h> #include <stdio.h> #include <time.h> #define SCALE 320.0f #define RESOLUTION 2047 #define MIN -RESOLUTION / SCALE #define MAX RESOLUTION / SCALE static float sigmoid_lut[RESOLUTION + 1]; void init_sigmoid_lut(void) { int i; for (i = 0; i < RESOLUTION + 1; i++) { sigmoid_lut[i] = (1.0 / (1.0 + exp(-i / SCALE))); } } static float sigmoid1(const float value) { return (1.0f / (1.0f + expf(-value))); } static float sigmoid2(const float value) { if (value <= MIN) return 0.0f; if (value >= MAX) return 1.0f; if (value >= 0) return sigmoid_lut[(int)(value * SCALE + 0.5f)]; return 1.0f-sigmoid_lut[(int)(-value * SCALE + 0.5f)]; } float test_error() { float x; float emax = 0.0; for (x = -10.0f; x < 10.0f; x+=0.00001f) { float v0 = sigmoid1(x); float v1 = sigmoid2(x); float error = fabsf(v1 - v0); if (error > emax) { emax = error; } } return emax; } int sigmoid1_perf() { clock_t t0, t1; int i; float x, y = 0.0f; t0 = clock(); for (i = 0; i < 10; i++) { for (x = -5.0f; x <= 5.0f; x+=0.00001f) { y = sigmoid1(x); } } t1 = clock(); printf("", y); /* To avoid sigmoidX() calls being optimized away */ return (t1 - t0) / (CLOCKS_PER_SEC / 1000); } int sigmoid2_perf() { clock_t t0, t1; int i; float x, y = 0.0f; t0 = clock(); for (i = 0; i < 10; i++) { for (x = -5.0f; x <= 5.0f; x+=0.00001f) { y = sigmoid2(x); } } t1 = clock(); printf("", y); /* To avoid sigmoidX() calls being optimized away */ return (t1 - t0) / (CLOCKS_PER_SEC / 1000); } int main(void) { init_sigmoid_lut(); printf("Max deviation is %0.6f\n", test_error()); printf("10^7 iterations using sigmoid1: %d ms\n", sigmoid1_perf()); printf("10^7 iterations using sigmoid2: %d ms\n", sigmoid2_perf()); return 0; }

$ gcc -O2 test.c -o test && ./test Max deviation is 0.001664 10^7 iterations using sigmoid1: 571 ms 10^7 iterations using sigmoid2: 113 ms

// Logistic sigmoid written as e^value / (1 + e^value), algebraically equal to
// 1 / (1 + e^-value). Returns a float in [0, 1].
public static float Sigmoid(double value)
{
    // Math.Exp returns double; the narrowing to float must be explicit in C#.
    float k = (float) Math.Exp(value);

    // k is +infinity once e^value exceeds the float range; inf / inf would be NaN,
    // so return the limit value instead.
    if (float.IsPositiveInfinity(k))
    {
        return 1.0f;
    }

    return k / (1.0f + k);
}

f(x) = (3x - x**3)/2

f(x) = x*(3 - x*x)/2

// Computes an approximate logistic sigmoid for a_Size floats, four at a time,
// using SSE intrinsics.
//
// exp(-x) is approximated by its Taylor polynomial around 0 (degree 4, or
// degree 6 when MORE_ACCURATE is defined), and the final reciprocal uses
// _mm_rcp_ps, which is itself an approximation.
// NOTE(review): a truncated Taylor series is only close to exp() near 0 --
// confirm the input range before relying on this.
//
// Preconditions visible from the code:
//  - a_Values and a_Output are dereferenced as __m128*, so both must be
//    16-byte aligned;
//  - the loop advances one __m128 (4 floats) per step, so a_Size should be a
//    multiple of 4; otherwise the last step touches elements past a_Size.
void sigmoid_sse(float *a_Values, float *a_Output, size_t a_Size){
    __m128* l_Output = (__m128*)a_Output;
    __m128* l_Start  = (__m128*)a_Values;
    __m128* l_End    = (__m128*)(a_Values + a_Size);

    const __m128 l_One        = _mm_set_ps1(1.f);
    const __m128 l_Half       = _mm_set_ps1(1.f / 2.f);
    const __m128 l_OneOver6   = _mm_set_ps1(1.f / 6.f);
    const __m128 l_OneOver24  = _mm_set_ps1(1.f / 24.f);
    const __m128 l_OneOver120 = _mm_set_ps1(1.f / 120.f);
    const __m128 l_OneOver720 = _mm_set_ps1(1.f / 720.f);
    const __m128 l_MinOne     = _mm_set_ps1(-1.f);

    for(__m128 *i = l_Start; i < l_End; i++){
        // 1.0 / (1.0 + Math.Pow(Math.E, -value))
        // 1.0 / (1.0 + Math.Exp(-value))

        // value = *i so we need -value
        __m128 value = _mm_mul_ps(l_MinOne, *i);

        // exp expressed as infinite series 1 + x + (x ^ 2 / 2!) + (x ^ 3 / 3!) ...
        __m128 x = value;

        // result in l_Exp
        __m128 l_Exp = l_One;                                       // = 1

        l_Exp = _mm_add_ps(l_Exp, x);                               // += x

        x = _mm_mul_ps(x, x);                                       // = x ^ 2
        l_Exp = _mm_add_ps(l_Exp, _mm_mul_ps(l_Half, x));           // += (x ^ 2 * (1 / 2))

        x = _mm_mul_ps(value, x);                                   // = x ^ 3
        l_Exp = _mm_add_ps(l_Exp, _mm_mul_ps(l_OneOver6, x));       // += (x ^ 3 * (1 / 6))

        x = _mm_mul_ps(value, x);                                   // = x ^ 4
        l_Exp = _mm_add_ps(l_Exp, _mm_mul_ps(l_OneOver24, x));      // += (x ^ 4 * (1 / 24))

#ifdef MORE_ACCURATE
        x = _mm_mul_ps(value, x);                                   // = x ^ 5
        l_Exp = _mm_add_ps(l_Exp, _mm_mul_ps(l_OneOver120, x));     // += (x ^ 5 * (1 / 120))

        x = _mm_mul_ps(value, x);                                   // = x ^ 6
        l_Exp = _mm_add_ps(l_Exp, _mm_mul_ps(l_OneOver720, x));     // += (x ^ 6 * (1 / 720))
#endif

        // we've calculated exp of -i
        // now we only need to do the '1.0 / (1.0 + ...' part
        *l_Output++ = _mm_rcp_ps(_mm_add_ps(l_One, l_Exp));
    }
}

// Logistic sigmoid written as e^v / (1 + e^v), with the input narrowed to
// float first. Returns a float in [0, 1].
public static float Sigmoid(double value)
{
    // Both narrowing conversions (double -> float) must be explicit in C#.
    float v = (float) value;
    float k = (float) Math.Exp(v);

    // k is +infinity once e^v exceeds the float range; inf / inf would be NaN,
    // so return the limit value instead.
    if (float.IsPositiveInfinity(k))
    {
        return 1.0f;
    }

    return k / (1.0f + k);
}

// Fast, coarse approximation of e^val built by writing directly into the upper
// 32 bits of an IEEE-754 double.
// 1512775 is approximately 2^20 / ln(2); 1072632447 is 1023 * 2^20 minus a
// correction of 60801. The lower 32 bits of the result are left at zero.
public static double Exp(double val)
{
    long highWord = (long) (1512775 * val + 1072632447);
    long bits = highWord << 32;
    return BitConverter.Int64BitsToDouble(bits);
}

$ gmcs -optimize test.cs && mono test.exe Max deviation is 0.001663983 10^7 iterations using Sigmoid1() took 1646.613 ms 10^7 iterations using Sigmoid2() took 237.352 ms

using System;
using System.Diagnostics;

// Benchmark comparing a direct Math.Exp sigmoid (Sigmoid1) against a
// lookup-table approximation (Sigmoid2).
class LUTTest
{
    // Table slots per unit of input.
    private const float SCALE = 320.0f;
    // Highest table index; the table holds RESOLUTION + 1 entries.
    private const int RESOLUTION = 2047;
    // Inputs at or beyond these bounds are clamped to 0 and 1.
    private const float MIN = -RESOLUTION / SCALE;
    private const float MAX = RESOLUTION / SCALE;

    // sigmoid(i / SCALE) for i = 0..RESOLUTION (non-negative half only).
    private static readonly float[] lut = InitLUT();

    // Builds the lookup table once, at type initialization.
    private static float[] InitLUT()
    {
        var table = new float[RESOLUTION + 1];
        for (int i = 0; i <= RESOLUTION; i++)
        {
            table[i] = (float)(1.0 / (1.0 + Math.Exp(-i / SCALE)));
        }
        return table;
    }

    // Reference sigmoid computed in double precision.
    public static float Sigmoid1(double value)
    {
        double denominator = 1.0 + Math.Exp(-value);
        return (float)(1.0 / denominator);
    }

    // Table-based sigmoid: clamp, then round to the nearest slot; negative
    // inputs use sigmoid(-x) = 1 - sigmoid(x).
    public static float Sigmoid2(float value)
    {
        if (value <= MIN)
        {
            return 0.0f;
        }
        if (value >= MAX)
        {
            return 1.0f;
        }
        if (value >= 0)
        {
            int slot = (int)(value * SCALE + 0.5f);
            return lut[slot];
        }

        int mirroredSlot = (int)(-value * SCALE + 0.5f);
        return 1.0f - lut[mirroredSlot];
    }

    // Absolute difference between two samples.
    public static float error(float v0, float v1)
    {
        return Math.Abs(v1 - v0);
    }

    // Largest deviation between Sigmoid1 and Sigmoid2 while sweeping [-10, 10).
    public static float TestError()
    {
        float worst = 0.0f;
        for (float x = -10.0f; x < 10.0f; x += 0.00001f)
        {
            float exact = Sigmoid1(x);
            float approx = Sigmoid2(x);
            float deviation = error(exact, approx);
            if (deviation > worst)
            {
                worst = deviation;
            }
        }
        return worst;
    }

    // Times 10 sweeps of Sigmoid1 over [-5, 5); returns elapsed milliseconds.
    public static double TestPerformancePlain()
    {
        Stopwatch timer = Stopwatch.StartNew();
        for (int pass = 0; pass < 10; pass++)
        {
            for (float x = -5.0f; x < 5.0f; x += 0.00001f)
            {
                Sigmoid1(x);
            }
        }
        timer.Stop();
        return timer.Elapsed.TotalMilliseconds;
    }

    // Times 10 sweeps of Sigmoid2 over [-5, 5); returns elapsed milliseconds.
    public static double TestPerformanceLUT()
    {
        Stopwatch timer = Stopwatch.StartNew();
        for (int pass = 0; pass < 10; pass++)
        {
            for (float x = -5.0f; x < 5.0f; x += 0.00001f)
            {
                Sigmoid2(x);
            }
        }
        timer.Stop();
        return timer.Elapsed.TotalMilliseconds;
    }

    static void Main()
    {
        Console.WriteLine("Max deviation is {0}", TestError());
        Console.WriteLine("10^7 iterations using Sigmoid1() took {0} ms", TestPerformancePlain());
        Console.WriteLine("10^7 iterations using Sigmoid2() took {0} ms", TestPerformanceLUT());
    }
}

// F# version of the lookup-table sigmoid benchmark.
// - lut holds sigmoid(x / Scale) for x = 0 .. Resolution.
// - sigmoid1 evaluates 1 / (1 + exp(-value)) directly.
// - sigmoid2 returns 0 / 1 outside (Min, Max); otherwise it rounds to the nearest
//   table slot and uses 1 - lut[...] for non-positive inputs.
// - getError reports the largest |sigmoid1 - f| over a stepped range; test times
//   10 x 1,000,000 calls with a Stopwatch.
// NOTE(review): the program has been collapsed onto a single line. F# #light
// syntax is indentation-sensitive, so the original line breaks and indentation
// must be restored before this will compile.
#light let Scale = 320.0f; let Resolution = 2047; let Min = -single(Resolution)/Scale; let Max = single(Resolution)/Scale; let range step a b = let count = int((b-a)/step); seq { for i in 0 .. count -> single(i)*step + a }; let lut = [| for x in 0 .. Resolution -> single(1.0/(1.0 + exp(-double(x)/double(Scale)))) |] let sigmoid1 value = 1.0f/(1.0f + exp(-value)); let sigmoid2 v = if (v <= Min) then 0.0f; elif (v>= Max) then 1.0f; else let f = v * Scale; if (v>0.0f) then lut.[int (f + 0.5f)] else 1.0f - lut.[int(0.5f - f)]; let getError f = let test = range 0.00001f -10.0f 10.0f; let errors = seq { for v in test -> abs(sigmoid1(single(v)) - f(single(v))) } Seq.max errors; open System.Diagnostics; let test f = let sw = Stopwatch.StartNew(); let mutable m = 0.0f; let result = for t in 1 .. 10 do for x in 1 .. 1000000 do m <- f(single(x)/100000.0f-5.0f); sw.Elapsed.TotalMilliseconds; printf "Max deviation is %f\n" (getError sigmoid2) printf "10^7 iterations using sigmoid1: %f ms\n" (test sigmoid1) printf "10^7 iterations using sigmoid2: %f ms\n" (test sigmoid2) let c = System.Console.ReadKey(true);

Max deviation is 0.001664 10^7 iterations using sigmoid1: 588.843700 ms 10^7 iterations using sigmoid2: 156.626700 ms

Max deviation is 0.001664 10^7 iterations using sigmoid1: 628 ms 10^7 iterations using sigmoid2: 157 ms

$ javac LUTTest.java && java LUTTest Max deviation is 0.001664 10^7 iterations using sigmoid1() took 1398 ms 10^7 iterations using sigmoid2() took 177 ms

/**
 * Benchmark comparing a direct {@code Math.exp} sigmoid ({@link #sigmoid1}) against a
 * lookup-table approximation ({@link #sigmoid2}).
 */
public class LUTTest {
    /** Table slots per unit of input. */
    private static final float SCALE = 320.0f;
    /** Highest table index; the table holds RESOLUTION + 1 entries. */
    private static final int RESOLUTION = 2047;
    /** Inputs at or beyond these bounds are clamped to 0 and 1. */
    private static final float MIN = -RESOLUTION / SCALE;
    private static final float MAX = RESOLUTION / SCALE;

    /** sigmoid(i / SCALE) for i = 0..RESOLUTION (non-negative half only). */
    private static final float[] lut = initLUT();

    /** Builds the lookup table once, during class initialization. */
    private static float[] initLUT() {
        float[] table = new float[RESOLUTION + 1];
        for (int i = 0; i <= RESOLUTION; i++) {
            table[i] = (float) (1.0 / (1.0 + Math.exp(-i / SCALE)));
        }
        return table;
    }

    /** Reference sigmoid computed in double precision. */
    public static float sigmoid1(double value) {
        double denominator = 1.0 + Math.exp(-value);
        return (float) (1.0 / denominator);
    }

    /**
     * Table-based sigmoid: clamp, then round to the nearest slot; negative inputs use
     * sigmoid(-x) = 1 - sigmoid(x).
     */
    public static float sigmoid2(float value) {
        if (value <= MIN) {
            return 0.0f;
        }
        if (value >= MAX) {
            return 1.0f;
        }
        if (value >= 0) {
            int slot = (int) (value * SCALE + 0.5f);
            return lut[slot];
        }
        int mirroredSlot = (int) (-value * SCALE + 0.5f);
        return 1.0f - lut[mirroredSlot];
    }

    /** Absolute difference between two samples. */
    public static float error(float v0, float v1) {
        return Math.abs(v1 - v0);
    }

    /** Largest deviation between sigmoid1 and sigmoid2 while sweeping [-10, 10). */
    public static float testError() {
        float worst = 0.0f;
        for (float x = -10.0f; x < 10.0f; x += 0.00001f) {
            float exact = sigmoid1(x);
            float approx = sigmoid2(x);
            float deviation = error(exact, approx);
            if (deviation > worst) {
                worst = deviation;
            }
        }
        return worst;
    }

    /** Times 10 sweeps of sigmoid1 over [-5, 5); returns elapsed milliseconds. */
    public static long sigmoid1Perf() {
        float sink = 0.0f;
        long start = System.currentTimeMillis();
        for (int pass = 0; pass < 10; pass++) {
            for (float x = -5.0f; x < 5.0f; x += 0.00001f) {
                sink = sigmoid1(x);
            }
        }
        long stop = System.currentTimeMillis();
        // The last result is passed to printf so the loop's work stays observable.
        System.out.printf("", sink);
        return stop - start;
    }

    /** Times 10 sweeps of sigmoid2 over [-5, 5); returns elapsed milliseconds. */
    public static long sigmoid2Perf() {
        float sink = 0.0f;
        long start = System.currentTimeMillis();
        for (int pass = 0; pass < 10; pass++) {
            for (float x = -5.0f; x < 5.0f; x += 0.00001f) {
                sink = sigmoid2(x);
            }
        }
        long stop = System.currentTimeMillis();
        // The last result is passed to printf so the loop's work stays observable.
        System.out.printf("", sink);
        return stop - start;
    }

    public static void main(String[] args) {
        System.out.printf("Max deviation is %f\n", testError());
        System.out.printf("10^7 iterations using sigmoid1() took %d ms\n", sigmoid1Perf());
        System.out.printf("10^7 iterations using sigmoid2() took %d ms\n", sigmoid2Perf());
    }
}
Empty Function: 79ms 0 Original: 1576ms 0.7202294 Simplified: (soprano) 681ms 0.7202294 Approximate: (Neil) 441ms 0.7198783 Bit Manip: (martinus) 836ms 0.72318 Taylor: (Rex Logan) 261ms 0.7202305 Lookup: (Henrik) 182ms 0.7204863
// Calls f ten million times on a fixed sample input and returns
// { elapsed milliseconds, f(sample) } so both can be fed to a format string.
public static object[] Time(Func<double, float> f)
{
    double sample = 0.9456;
    Stopwatch timer = Stopwatch.StartNew();
    for (int call = 0; call < 1e7; call++)
    {
        f(sample);
    }

    // Read the clock before the extra call that produces the reported value.
    long elapsedMs = timer.ElapsedMilliseconds;
    float result = f(sample);
    return new object[] { elapsedMs, result };
}

// Runs every candidate implementation through Time and prints one row each.
public static void Main(string[] args)
{
    Console.WriteLine("Empty: {0,10}ms {1}", Time(Empty));
    Console.WriteLine("Original: {0,10}ms {1}", Time(Original));
    Console.WriteLine("Simplified: {0,10}ms {1}", Time(Simplified));
    Console.WriteLine("Approximate: {0,10}ms {1}", Time(ExpApproximation));
    Console.WriteLine("Bit Manip: {0,10}ms {1}", Time(BitBashing));
    Console.WriteLine("Taylor: {0,10}ms {1}", Time(TaylorExpansion));
    Console.WriteLine("Lookup: {0,10}ms {1}", Time(LUT));
}

// Logistic sigmoid expressed through the hyperbolic tangent:
// sigmoid(value) = 0.5 + 0.5 * tanh(value / 2).
public static double Sigmoid(double value)
{
    double halfTanh = 0.5d * Math.Tanh(value / 2);
    return 0.5d + halfTanh;
}

// Logistic sigmoid with early exits: inputs below -45 return 0 and inputs
// above 45 return 1 without calling Math.Exp.
public double Sigmoid(double value)
{
    if (value > 45.0)
    {
        return 1.0;
    }
    if (value < -45.0)
    {
        return 0.0;
    }

    double denominator = 1.0 + Math.Exp(-value);
    return 1.0 / denominator;
}