如何在Delphi中快速将值“钳制”到范围内?

如何在Delphi中快速将值“钳制”到范围内?,delphi,optimization,signal-processing,Delphi,Optimization,Signal Processing,我有许多样本处理原语,如: function Add8(A, B: Byte): Byte; {$IFDEF CODEINLINING}inline;{$ENDIF} begin Result := A + B; end; function Sub16(A, B: Word): Word; {$IFDEF CODEINLINING}inline;{$ENDIF} begin Result := A - B; end; { et cetera } 这些函数是数据处理的工作马,每个输入

我有许多样本处理原语,如:

// Adds two 8-bit samples and returns the sum as a Byte.
// No clamping is performed here: a sum above High(Byte) does not fit the result.
// NOTE(review): presumably the value wraps modulo 256 when range/overflow checking
// is off (and raises when it is on) - confirm against the project's compiler settings.
function Add8(A, B: Byte): Byte; {$IFDEF CODEINLINING}inline;{$ENDIF}
begin
  Result := A + B;
end;

// Subtracts B from A for two 16-bit samples and returns the difference as a Word.
// No clamping is performed here: when B > A the difference is negative and does not fit the result.
// NOTE(review): presumably the value wraps modulo 65536 when range/overflow checking
// is off (and raises when it is on) - confirm against the project's compiler settings.
function Sub16(A, B: Word): Word; {$IFDEF CODEINLINING}inline;{$ENDIF}
begin
  Result := A - B;
end;

{ et cetera }
这些函数是数据处理的主力,每个输入样本都要调用它们。按设计,结果类型必须与操作数的大小相同。

当操作结果超出 Low(Result)..High(Result) 范围时,就会出现问题:最高有效位被截断,结果实际上是错误的。例如:把一个较小的值加到峰值上,Add8(240, 22) 会使峰值消失,而更理想的结果是 255。将两个接近基线水平的值相减,Sub16(32000, 33000),我更希望得到 0。


我的问题是:如何以高性能的方式让这类操作把结果值钳制在范围内?对于所有算术运算和所有基本类型(8 位、16 位、无符号、有符号),是否有一个通用的解决方案?

因为您要处理大量数据,我建议尝试使用汇编——MMX、SSE2 指令正是为此类任务设计的。例如,paddusb 指令可以一次对 16 对字节做加法,并把结果饱和到字节范围内。不要忘记正确对齐数据块。

下面是针对 32 位编译器的示例(未经彻底测试)。在处理 256M 的数组时,它比 Pascal 版本快约 9 倍(重复 10 次,604 毫秒对 5100 毫秒)。请注意,对于合理的数据规模,Pascal 版本也已经相当快。

program Project1;
{$APPTYPE CONSOLE}
uses SysUtils;

procedure AddBytesSat(const A, B, Res: PByteArray; Len: Integer);
// Adds byte arrays with unsigned saturation: Res[i] := Min(A[i] + B[i], 255).
// Written for the 32-bit register calling convention:
//   A in eax, B in edx, Res in ecx, Len on the stack.
// Requirements:
//   - all three arrays must be aligned to a 16-byte border (movdqa is an aligned access);
//   - only whole 16-byte blocks are processed, so Len should be divisible by 16;
//     a trailing Len mod 16 bytes are left untouched.
// Len < 16 (including zero and negative values) is a no-op.
asm
  push esi
  mov esi, ecx // save Res pointer, ecx is needed as the block counter
  mov ecx, Len
  sar ecx, 4   // Len div 16 blocks; arithmetic shift keeps a negative Len negative
  test ecx, ecx
  jle @@done   // no full block: skip the loop (LOOP with ecx = 0 would wrap around and overrun)
@@start:
  movdqa xmm0, [eax]  // copies 16 bytes (aligned) to sse register

  paddusb xmm0, [edx] // adds 16 unsigned values with saturation

  movdqa [esi], xmm0  // move result bytes back to memory
  add eax, 16  // move array pointers
  add edx, 16
  add esi, 16
  dec ecx
  jnz @@start  // go to next iteration while blocks remain
@@done:
  pop esi
end;

var
  A, B, C: PByteArray;
  i: integer;
begin
  // Demo driver: fills two 32-byte arrays, adds them with saturation and
  // prints the first 16 result bytes.

  //ensure that memory manager returns properly aligned blocks
  SetMinimumBlockAlignment(System.mba16Byte);

  // Start from nil so the finally-section can release whatever was allocated.
  A := nil;
  B := nil;
  C := nil;
  try
    GetMem(A, 32);
    GetMem(B, 32);
    GetMem(C, 32);

    for i := 0 to 31 do begin
      A[i] := 8 * i;
      B[i] := 200;
    end;

    AddBytesSat(A, B, C, 32);

    //clamping demonstration
    for i := 0 to 15 do
      Writeln(C[i]);

    Readln;
  finally
    // The buffers were previously never released.
    FreeMem(C);
    FreeMem(B);
    FreeMem(A);
  end;
end.

因为您要处理大量数据,我建议尝试使用汇编——MMX、SSE2 指令正是为此类任务设计的。例如,paddusb 指令可以一次对 16 对字节做加法,并把结果饱和到字节范围内。不要忘记正确对齐数据块。

下面是针对 32 位编译器的示例(未经彻底测试)。在处理 256M 的数组时,它比 Pascal 版本快约 9 倍(重复 10 次,604 毫秒对 5100 毫秒)。请注意,对于合理的数据规模,Pascal 版本也已经相当快。

program Project1;
{$APPTYPE CONSOLE}
uses SysUtils;

procedure AddBytesSat(const A, B, Res: PByteArray; Len: Integer);
// Adds byte arrays with unsigned saturation: Res[i] := Min(A[i] + B[i], 255).
// Written for the 32-bit register calling convention:
//   A in eax, B in edx, Res in ecx, Len on the stack.
// Requirements:
//   - all three arrays must be aligned to a 16-byte border (movdqa is an aligned access);
//   - only whole 16-byte blocks are processed, so Len should be divisible by 16;
//     a trailing Len mod 16 bytes are left untouched.
// Len < 16 (including zero and negative values) is a no-op.
asm
  push esi
  mov esi, ecx // save Res pointer, ecx is needed as the block counter
  mov ecx, Len
  sar ecx, 4   // Len div 16 blocks; arithmetic shift keeps a negative Len negative
  test ecx, ecx
  jle @@done   // no full block: skip the loop (LOOP with ecx = 0 would wrap around and overrun)
@@start:
  movdqa xmm0, [eax]  // copies 16 bytes (aligned) to sse register

  paddusb xmm0, [edx] // adds 16 unsigned values with saturation

  movdqa [esi], xmm0  // move result bytes back to memory
  add eax, 16  // move array pointers
  add edx, 16
  add esi, 16
  dec ecx
  jnz @@start  // go to next iteration while blocks remain
@@done:
  pop esi
end;

var
  A, B, C: PByteArray;
  i: integer;
begin
  // Demo driver: fills two 32-byte arrays, adds them with saturation and
  // prints the first 16 result bytes.

  //ensure that memory manager returns properly aligned blocks
  SetMinimumBlockAlignment(System.mba16Byte);

  // Start from nil so the finally-section can release whatever was allocated.
  A := nil;
  B := nil;
  C := nil;
  try
    GetMem(A, 32);
    GetMem(B, 32);
    GetMem(C, 32);

    for i := 0 to 31 do begin
      A[i] := 8 * i;
      B[i] := 200;
    end;

    AddBytesSat(A, B, C, 32);

    //clamping demonstration
    for i := 0 to 15 do
      Writeln(C[i]);

    Readln;
  finally
    // The buffers were previously never released.
    FreeMem(C);
    FreeMem(B);
    FreeMem(A);
  end;
end.

如果要钳制的值只是略高于 255 或略低于零,则可以使用查找表。只要表中常用的那部分值能放进一级缓存,这种查找表方法就非常快。编译器会把 byte_clamp_lut+256 转换为指向表中间的单个内存引用。我不知道 Pascal/Delphi 是否允许使用宏,但在 C/C++ 中是这样做的:

/*
 * byte_clamp(v): clamp an integer v to 0..255 with a single table lookup.
 * Valid only for v in -256..511; anything outside that range indexes past
 * the table. The table is addressed from its middle (offset 256) so that
 * negative v lands in the leading zero region.
 */
#define byte_clamp(v) ((byte_clamp_lut + 256)[(v)])

/*
 * 768 entries: 256 zeros (v = -256..-1), the identity 0..255 (v = 0..255),
 * then 256 copies of 255 (v = 256..511).
 */
static const uint8_t byte_clamp_lut[768] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
};

如果要钳制的值只是略高于 255 或略低于零,则可以使用查找表。只要表中常用的那部分值能放进一级缓存,这种查找表方法就非常快。编译器会把 byte_clamp_lut+256 转换为指向表中间的单个内存引用。我不知道 Pascal/Delphi 是否允许使用宏,但在 C/C++ 中是这样做的:

/*
 * byte_clamp(v): clamp an integer v to 0..255 with a single table lookup.
 * Valid only for v in -256..511; anything outside that range indexes past
 * the table. The table is addressed from its middle (offset 256) so that
 * negative v lands in the leading zero region.
 */
#define byte_clamp(v) ((byte_clamp_lut + 256)[(v)])

/*
 * 768 entries: 256 zeros (v = -256..-1), the identity 0..255 (v = 0..255),
 * then 256 copies of 255 (v = 256..511).
 */
static const uint8_t byte_clamp_lut[768] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
};

您操作的是单个值,还是数组?@DavidHeffernan,操作的是单个值。显然,在这个抽象层次的背后是数组。好吧,您所问问题的答案可能并不是您实际问题的最佳解决方案。解决问题的最佳方法可能是直接对数组进行操作。我现在已删除的答案只是从字面上回答了您的问题,但 @MBo 指出了一个更好的方法来解决您的根本问题。@DavidHeffernan 不幸的是,我错过了您的答案。它的要点是什么?Result := EnsureRange(Integer(A) + Integer(B), Low(A), High(A)); 但您会希望编写自己的内联版本的 EnsureRange。您操作的是单个值,还是数组?@DavidHeffernan,操作的是单个值。显然,在这个抽象层次的背后是数组。好吧,您所问问题的答案可能并不是您实际问题的最佳解决方案。解决问题的最佳方法可能是直接对数组进行操作。我现在已删除的答案只是从字面上回答了您的问题,但 @MBo 指出了一个更好的方法来解决您的根本问题。@DavidHeffernan 不幸的是,我错过了您的答案。它的要点是什么?Result := EnsureRange(Integer(A) + Integer(B), Low(A), High(A)); 但您会希望编写自己的内联版本的 EnsureRange。