C 是否有标准宏来检测需要对齐内存访问的体系结构?

C 是否有标准宏来检测需要对齐内存访问的体系结构?,c,c-preprocessor,memory-alignment,C,C Preprocessor,Memory Alignment,假设如下: void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len) { unsigned int i; for(i=0; i<len; i++) { dest[i] = src[i] & mask[i]; } } void mask_bytes(unsigned char* dest, unsigned char* sr

假设如下:

/*
 * Byte-wise AND of two buffers: dest[k] = src[k] & mask[k] for every k in
 * [0, len). Bytes are handled one at a time in ascending order, so none of
 * the pointers needs any particular alignment.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int pos = 0;

  while (pos < len)
  {
    dest[pos] = src[pos] & mask[pos];
    ++pos;
  }
}
/*
 * AND src with mask into dest: dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * The buffers are processed 32 bits at a time only where that is safe on
 * strict-alignment machines (e.g. SPARC, which raises SIGBUS on a misaligned
 * word access):
 *   - if dest, src and mask all share the same offset from a 4-byte
 *     boundary, leading bytes are handled one by one until the pointers are
 *     aligned, then whole words, then the trailing bytes;
 *   - otherwise no common alignment exists and every byte is handled
 *     individually.
 *
 * dest may be the same pointer as src or mask; partially overlapping buffers
 * are not supported.
 *
 * NOTE: the word loop still reads/writes the buffers through uint32_t
 * lvalues, so the usual strict-aliasing caveats of this technique apply.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  const uintptr_t low_bits = sizeof(uint32_t) - 1;
  const uintptr_t dest_off = (uintptr_t)dest & low_bits;
  unsigned int i = 0;

  if (dest_off == ((uintptr_t)src & low_bits) && dest_off == ((uintptr_t)mask & low_bits))
  {
    /* Bytes needed to reach the next 4-byte boundary (0 if already there). */
    unsigned int head = dest_off ? (unsigned int)(sizeof(uint32_t) - dest_off) : 0;
    unsigned int wordlen;

    if (head > len)
    {
      head = len;
    }
    for (; i < head; i++)
    {
      dest[i] = src[i] & mask[i];
    }

    wordlen = (len - i) >> 2;
    if (wordlen > 0)
    {
      /* i == head here, so all three pointers are now 4-byte aligned. */
      uint32_t *dest_w = (uint32_t *)(void *)(dest + i);
      uint32_t *src_w = (uint32_t *)(void *)(src + i);
      uint32_t *mask_w = (uint32_t *)(void *)(mask + i);
      unsigned int w;

      for (w = 0; w < wordlen; w++)
      {
        dest_w[w] = src_w[w] & mask_w[w];
      }
      i += wordlen << 2;
    }
  }

  /* Trailing bytes, or the whole buffer when no common alignment exists. */
  for (; i < len; i++)
  {
    dest[i] = src[i] & mask[i];
  }
}
/*
 * dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * __ALIGNED2__/__ALIGNED4__/__ALIGNED8__ are not defined by the C standard,
 * so they cannot be relied on to keep the word loop away from
 * strict-alignment targets. They are still honoured as a compile-time
 * opt-out, but the word loop is additionally guarded by a run-time check
 * that all three pointers are 4-byte aligned. Anything that does not take
 * the word loop is handled by the byte loop at the end.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int i = 0;

#if !(defined(__ALIGNED2__) || defined(__ALIGNED4__) || defined(__ALIGNED8__))
  // go fast, but only when every pointer sits on a uint32_t boundary;
  // a misaligned word access raises SIGBUS on SPARC and similar archs.
  if ((((uintptr_t)dest | (uintptr_t)src | (uintptr_t)mask) & (sizeof(uint32_t) - 1)) == 0)
  {
    unsigned int wordlen = len >> 2;

    for (i = 0; i < wordlen; i++)
    {
      ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i];
    }
    // continue with the bytes that did not fill a whole word
    i = wordlen << 2;
  }
#endif

  // go slow: remaining tail bytes, or the whole buffer if the word loop was skipped
  for (; i < len; i++)
  {
    dest[i] = src[i] & mask[i];
  }
}
虽然 x86 会静默地修复未对齐的访问,但这对性能来说并不是最优的。通常最好假设某种对齐方式,并自己进行修正:

/* Alignment boundary, in bytes, that the copy skeleton below tests against. */
unsigned int const alignment = 8;   /* or 16, or sizeof(long) */

/*
 * Skeleton of an alignment-aware copy. Only the decision structure is
 * present: every branch body is a placeholder comment, so as written the
 * function copies nothing.
 *
 *   dst, src - destination and source buffers
 *   size     - number of bytes; only used to test where the copy ends
 *
 * NOTE(review): the name collides with the standard library memcpy and the
 * signature differs from it - presumably illustrative only; rename before
 * using in real code.
 * NOTE(review): intptr_t is signed, so % can yield a negative remainder for
 * addresses that convert to negative values; uintptr_t looks like the safer
 * choice - confirm.
 */
void memcpy(char *dst, char const *src, unsigned int size) {
    /* Word copies are only possible if both pointers are misaligned by the same amount. */
    if((((intptr_t)dst) % alignment) != (((intptr_t)src) % alignment)) {
        /* no common alignment, copy as bytes or shift around */
    } else {
        /* dst does not start on a boundary */
        if(((intptr_t)dst) % alignment) {
            /* copy bytes at the beginning */
        }
        /* copy words in the middle */
        /* the copy does not end on a boundary */
        if(((intptr_t)dst + size) % alignment) {
            /* copy bytes at the end */
        }
    }
}

另外,可以了解一下 SIMD 指令。

标准做法是使用 `configure` 脚本运行一个测试程序来检测对齐问题。如果测试程序没有崩溃,`configure` 脚本就会在生成的配置头文件中定义一个宏,以启用更快的实现。更安全的实现是默认选项。

/*
 * dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * When the build defines UNALIGNED, whole 32-bit words are combined first
 * and only the leftover bytes go through the byte loop. Without UNALIGNED
 * the byte loop handles the entire buffer.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
  unsigned int idx = 0;

#if defined(UNALIGNED)
  // go fast
  {
    const unsigned int nwords = len >> 2;

    while (idx < nwords)
    {
      // the following line will raise SIGBUS on SPARC and other archs that require aligned access.
      ((uint32_t*)dest)[idx] = ((uint32_t*)src)[idx] & ((uint32_t*)mask)[idx];
      idx++;
    }
    // switch from a word index to the byte index of the first leftover byte
    idx = nwords << 2;
  }
#endif

  // go slow (everything, or just the tail after the word loop)
  while (idx < len)
  {
    dest[idx] = src[idx] & mask[idx];
    idx++;
  }
}
(按位与运算是可交换的,所以我觉得把两个操作数分别叫作 `src` 和 `mask` 有些奇怪。我把 `mask_bytes` 重命名为了 `memand`。但无论如何……)

另一种选择是提供多个不同的函数,利用 C 语言中的类型。例如:

/*
 * Byte-wise AND: dest[k] = src1[k] & src2[k] for k in [0, len).
 *
 *   dest       - output buffer, at least len bytes
 *   src1, src2 - input buffers, at least len bytes each
 *   len        - number of bytes to process
 *
 * The index has the same type as len; an unsigned int index would wrap
 * around and never reach len when len exceeds UINT_MAX.
 */
void memand_bytes(char *dest, char *src1, char *src2, size_t len)
{
    size_t i;
    for (i = 0; i < len; i++) {
        dest[i] = src1[i] & src2[i];
    }
}

/*
 * Element-wise AND of int arrays: dest[k] = src1[k] & src2[k] for k in
 * [0, len).
 *
 *   dest       - output array, at least len elements
 *   src1, src2 - input arrays, at least len elements each
 *   len        - number of int elements (not bytes) to process
 *
 * The index has the same type as len; an unsigned int index would wrap
 * around and never reach len when len exceeds UINT_MAX.
 */
void memand_ints(int *dest, int *src1, int *src2, size_t len)
{
    size_t i;
    for (i = 0; i < len; i++) {
        dest[i] = src1[i] & src2[i];
    }
}
这样你就可以让程序员来决定了。

CPU 需要额外的周期来取出未对齐的数据并把它移到正确的位置,通常这会比对齐访问慢很多。你应该始终尽量读取对齐的数据……

正如我所说,我一直在和无法进行跨对齐复制的系统打交道,所以我已经习惯于假设存在“普通”复制和“快速”复制两种路径。

不幸的是,这是在一个库中,我无法控制此库的用户如何对齐他们传给我的缓冲区。

鉴于 OP 把 `i` 定义在 `for` 循环之外,我担心他没有 C99,也就没有 `intptr_t`。

即使没有 C99,我见过的每一个类 Unix 系统很久以来都在 `inttypes.h` 中提供了 `intptr_t`……我不认为这是个问题。

另外 +1,感谢 Simon 以最佳方式解决了这个问题,即使在“允许”未对齐访问的架构上也是如此。但是把 `alignment` 设为变量而不是常量可能不是一个好主意。

+1,但我会使用 `uintptr_t`,因为对有符号值取模的结果值得怀疑。至于 SIMD 指令,一个好的编译器应该能自己处理好这一点:例如,如果你给 gcc 加上 `-march=native`,我认为它就会这样做,只要你给它足够大的整数类型来处理。