C 是否有标准宏来检测需要对齐内存访问的体系结构?
假设如下:C 是否有标准宏来检测需要对齐内存访问的体系结构?,c,c-preprocessor,memory-alignment,C,C Preprocessor,Memory Alignment,假设如下: void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len) { unsigned int i; for(i=0; i<len; i++) { dest[i] = src[i] & mask[i]; } } void mask_bytes(unsigned char* dest, unsigned char* sr
/*
 * Bitwise-AND two byte buffers element by element.
 *
 * dest: receives len result bytes (dest[k] = src[k] & mask[k])
 * src:  first input buffer, read for len bytes
 * mask: second input buffer, read for len bytes
 * len:  number of bytes to process; 0 leaves dest untouched
 *
 * Only single-byte accesses are used, so no alignment is required.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
    unsigned int remaining = len;

    /* Advance all three cursors in lockstep, lowest address first. */
    while (remaining-- > 0)
    {
        *dest++ = *src++ & *mask++;
    }
}
/*
 * Bitwise-AND two byte buffers: dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * dest: receives len result bytes
 * src:  first input buffer, read for len bytes
 * mask: second input buffer, read for len bytes
 * len:  number of bytes to process
 *
 * The bulk of the data is processed 32 bits at a time, but only when dest,
 * src and mask are all 4-byte aligned. Dereferencing a misaligned uint32_t
 * pointer raises SIGBUS on SPARC and other architectures that require aligned
 * access, so any other combination of pointers is handled byte by byte.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
    unsigned int i;
    unsigned int wordlen = 0;
    /* OR-ing the three addresses lets a single test cover all of them. */
    uintptr_t addr_bits = (uintptr_t)dest | (uintptr_t)src | (uintptr_t)mask;

    if ((addr_bits & (sizeof(uint32_t) - 1)) == 0)
    {
        wordlen = len >> 2;
        for (i = 0; i < wordlen; i++)
        {
            ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i];
        }
    }

    /* Tail bytes, or the whole buffer when the word path was skipped. */
    for (i = wordlen << 2; i < len; i++)
    {
        dest[i] = src[i] & mask[i];
    }
}
/*
 * Bitwise-AND two byte buffers: dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * dest: receives len result bytes
 * src:  first input buffer, read for len bytes
 * mask: second input buffer, read for len bytes
 * len:  number of bytes to process
 *
 * When one of __ALIGNED2__/__ALIGNED4__/__ALIGNED8__ is defined, only the
 * byte loop is compiled.
 * NOTE(review): these macro names are hypothetical placeholders; confirm that
 * the toolchain in use actually defines them before relying on this switch.
 *
 * Because the switch cannot be relied on, the word-sized path also checks the
 * pointers at run time and is used only when all three are 4-byte aligned;
 * a misaligned 32-bit access raises SIGBUS on SPARC and other architectures
 * that require aligned access.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
    unsigned int i;
#if defined(__ALIGNED2__) || defined(__ALIGNED4__) || defined(__ALIGNED8__)
    // go slow
    for (i = 0; i < len; i++)
    {
        dest[i] = src[i] & mask[i];
    }
#else
    // go fast, but only when every pointer is suitably aligned
    unsigned int wordlen = 0;
    /* OR-ing the three addresses lets a single test cover all of them. */
    uintptr_t addr_bits = (uintptr_t)dest | (uintptr_t)src | (uintptr_t)mask;

    if ((addr_bits & (sizeof(uint32_t) - 1)) == 0)
    {
        wordlen = len >> 2;
        for (i = 0; i < wordlen; i++)
        {
            ((uint32_t*)dest)[i] = ((uint32_t*)src)[i] & ((uint32_t*)mask)[i];
        }
    }

    /* Tail bytes, or the whole buffer when the word path was skipped. */
    for (i = wordlen << 2; i < len; i++)
    {
        dest[i] = src[i] & mask[i];
    }
#endif
}
void mask_字节(unsigned char*dest、unsigned char*src、unsigned char*mask、unsigned int len)
{
无符号整数i;
对于(i=0;i>2;
虽然 x86 会静默地修正未对齐的访问,但这对性能来说远非最优。通常最好假定某种对齐方式,并自行处理未对齐的部分:
unsigned int const alignment = 8; /* or 16, or sizeof(long) */

/*
 * Skeleton of an alignment-aware copy routine: it classifies the request by
 * how dst and src sit relative to `alignment`, and leaves the actual copying
 * of each region as a placeholder comment.
 *
 * dst:  destination buffer
 * src:  source buffer
 * size: number of bytes requested
 *
 * Addresses are converted to uintptr_t rather than intptr_t so the remainder
 * is never computed on a negative value.
 *
 * NOTE(review): this name and signature differ from the standard library's
 * memcpy declared in <string.h>; the name is kept so existing callers still
 * link, but a distinct name would avoid the clash.
 */
void memcpy(char *dst, char const *src, unsigned int size) {
    /* Offset of each pointer from the previous alignment boundary. */
    uintptr_t const dst_off = (uintptr_t)dst % alignment;
    uintptr_t const src_off = (uintptr_t)src % alignment;

    if (dst_off != src_off) {
        /* no common alignment, copy as bytes or shift around */
    } else {
        if (dst_off) {
            /* copy bytes at the beginning */
        }
        /* copy words in the middle */
        if (((uintptr_t)dst + size) % alignment) {
            /* copy bytes at the end */
        }
    }
}
另外,请了解一下 SIMD 指令。
标准做法是使用配置脚本运行一个测试程序来检测对齐问题。如果测试程序没有崩溃,配置脚本就会在生成的配置头文件中定义一个宏,从而启用更快的实现。更安全的实现则作为默认选项。
/*
 * Bitwise-AND two byte buffers: dest[k] = src[k] & mask[k] for k in [0, len).
 *
 * dest: receives len result bytes
 * src:  first input buffer, read for len bytes
 * mask: second input buffer, read for len bytes
 * len:  number of bytes to process
 *
 * The byte-at-a-time loop is the default. The 32-bit variant is compiled only
 * when UNALIGNED is defined (presumably by a configure-time probe; confirm
 * against the build system), because it dereferences the buffers as uint32_t
 * without checking their alignment.
 */
void mask_bytes(unsigned char* dest, unsigned char* src, unsigned char* mask, unsigned int len)
{
#if defined(UNALIGNED)
    /* Fast variant: whole 32-bit words first, then the leftover bytes. */
    uint32_t *out_words = (uint32_t *)dest;
    uint32_t *in_words = (uint32_t *)src;
    uint32_t *mask_words = (uint32_t *)mask;
    unsigned int word_count = len >> 2;
    unsigned int pos;

    for (pos = 0; pos < word_count; pos++)
    {
        /* This access raises SIGBUS on SPARC and other archs that require aligned access. */
        out_words[pos] = in_words[pos] & mask_words[pos];
    }
    for (pos = word_count << 2; pos < len; pos++)
    {
        dest[pos] = src[pos] & mask[pos];
    }
#else
    /* Safe variant: one byte per step, no alignment requirement. */
    unsigned int pos = 0;

    while (pos < len)
    {
        dest[pos] = src[pos] & mask[pos];
        pos++;
    }
#endif
}
void mask_字节(unsigned char*dest、unsigned char*src、unsigned char*mask、unsigned int len)
{
无符号整数i;
无符号整数字len=len>>2;
#如果已定义(未对齐)
//快走
(既然这两个操作数可以互换,我觉得把它们分别叫作 src 和 mask 有点奇怪。我把 mask_bytes 重命名成了 memand。不过这无关紧要……)
另一种选择是编写利用 C 语言类型系统的不同函数。例如:
/*
 * Bitwise-AND two char buffers element by element.
 *
 * dest: receives len results (dest[k] = src1[k] & src2[k])
 * src1: first input buffer, read for len elements
 * src2: second input buffer, read for len elements
 * len:  number of chars to process; 0 leaves dest untouched
 */
void memand_bytes(char *dest, char *src1, char *src2, size_t len)
{
    /* The index has the same type as len: an unsigned int counter would wrap
     * around and never reach a len greater than UINT_MAX. */
    size_t i;

    for (i = 0; i < len; i++) {
        dest[i] = src1[i] & src2[i];
    }
}
/*
 * Bitwise-AND two int arrays element by element.
 *
 * dest: receives len results (dest[k] = src1[k] & src2[k])
 * src1: first input array, read for len elements
 * src2: second input array, read for len elements
 * len:  number of ints (not bytes) to process; 0 leaves dest untouched
 */
void memand_ints(int *dest, int *src1, int *src2, size_t len)
{
    /* The index has the same type as len: an unsigned int counter would wrap
     * around and never reach a len greater than UINT_MAX. */
    size_t i;

    for (i = 0; i < len; i++) {
        dest[i] = src1[i] & src2[i];
    }
}
void memand\u字节(char*dest,char*src1,char*src2,size\t len)
{
无符号整数i;
对于(i=0;i
这样就可以把决定权交给程序员了。
CPU 需要额外的周期来取出未对齐的数据并将其移位到正确的位置,通常这会比对齐访问慢得多。你应该始终尽量读取对齐的数据……
正如我所说,我一直在处理无法进行跨对齐复制的系统,所以我已经习惯于假设存在“普通”和“快速”两种复制方式。
不幸的是,这是在一个库中,我无法控制库的用户如何对齐他们传给我的缓冲区。
鉴于 OP 把 `i` 定义在 `for` 循环之外,我担心他用不了 C99 或 `intptr_t`。
即使没有 C99,我见过的每一个类 Unix 系统的 `inttypes.h` 中也早就提供了 `intptr_t`……我认为这不是问题。另外给 Simon +1,感谢他给出了最佳的解决方式——即使在“允许”未对齐访问的体系结构上也是如此。不过,把 `alignment` 定义成变量而不是常量可能不是个好主意。
+1,但我会使用 `uintptr_t`,因为对有符号值取模的结果值得怀疑。至于 SIMD 指令,好的编译器应该能自己处理,例如给 gcc 加上 `-march=native`,我认为只要你提供足够大的整数类型让它处理,它就会这样做。