C 这是优化堆上多维数组的可能方法吗?

C 这是优化堆上多维数组的可能方法吗?,c,arrays,multidimensional-array,C,Arrays,Multidimensional Array,下面是在堆上分配多维数组的常用方法,方法是使用指向指针的指针 typedef struct ArrayInt { int *array; int length; } ArrayInt; static void ArrayIntCreate(ArrayInt *array, int length) { array->array = MjMalloc(length * sizeof(int)); array->length = length; } st

下面是在堆上分配多维数组的常用方法,方法是使用指向指针的指针

/* A heap-allocated run of ints together with its element count. */
struct ArrayInt {
    int *array;  /* element storage; set by ArrayIntCreate() */
    int length;  /* element count; set by ArrayIntCreate() */
};
typedef struct ArrayInt ArrayInt;

/*
 * Give *array a freshly allocated buffer of `length` ints (contents
 * uninitialized) and record `length` in it.
 */
static void ArrayIntCreate(ArrayInt *array, int length) {
    const size_t bytes = length * sizeof *array->array;

    array->length = length;
    array->array = MjMalloc(bytes);
}

/*
 * Release the buffer owned by *array.
 *
 * The struct is reset to an empty state afterwards so that a stale pointer
 * is never left behind: deleting the same ArrayInt twice is harmless
 * (free(NULL) is a no-op) instead of a double free.
 */
static void ArrayIntDelete(ArrayInt *array) {
    free(array->array);
    array->array = NULL;
    array->length = 0;
}

/* A two-dimensional int array stored as an array of ArrayInt rows. */
struct ArrayArrayInt {
    ArrayInt *array;  /* one ArrayInt per row; each row owns its own buffer */
    int length;       /* number of rows */
};
typedef struct ArrayArrayInt ArrayArrayInt;

/*
 * Build a table of `length` rows: one block holding the ArrayInt row
 * headers, then a separately allocated buffer of `length2` ints per row.
 */
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
    array->length = length;
    array->array = MjMalloc(length * sizeof *array->array);

    ArrayInt *row = array->array;
    int remaining = length;
    while (remaining-- > 0) {
        ArrayIntCreate(row++, length2);
    }
}

/*
 * Release every row buffer and then the block of row headers.
 *
 * The struct is reset to an empty state afterwards, so a second call sees
 * length == 0 and a NULL pointer and does nothing, rather than walking
 * freed memory and freeing it again.
 */
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
    for (int i = 0; i < array->length; i += 1) {
        ArrayIntDelete(&array->array[i]);
    }
    free(array->array);
    array->array = NULL;
    array->length = 0;
}
当运行下面的测试代码时,第二个版本的运行速度大约快20%。原因可能是什么?这是一种普遍适用的优化技术吗?是否有一些库定义了此类数组类型以进行这种优化?

在编辑之前,我在测试代码中犯了一个巨大的错误。第一个版本运行得比较慢,因为它的分配和解除分配保持在for循环中,而第二个版本在进入循环之前只执行了一次。请参阅下面测试代码中的注释。在使两个测试相等之后,我发现第一个版本可以运行得更快,尤其是在优化之后。我将更复杂的操作和各种拷贝放入测试代码中,我发现第一个总是运行得更快一些。在我的机器里,索引的乘法运算似乎很慢?不过,我不确定原因

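static double ElapsedTime(clock_t startTime, clock_t endTime) {
    return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}

#define N 2000

int main() {
    ArrayArrayInt aai;
    ArrayArrayInt2 aai2;
    long long int sum;
    clock_t startTime, endTime;

    startTime = clock();
    sum = 0;
    for (int k = 0; k < N; k += 1) {
        ArrayArrayIntCreate(&aai, N, N);
        for (int i = 0; i < aai.length; i += 1) {
            int j = 0;
            for (; j < aai.array[i].length; j += 1) {
                aai.array[i].array[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai.array[i].array[j] - i + 1;
            }
        }
        ArrayArrayIntDelete(&aai);
    }
    endTime = clock();
    printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    startTime = clock();
    sum = 0;
    ArrayArrayInt2Create(&aai2, N, N);  //这里有错误!!
    for (int k = 0; k < N; k += 1) {
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At(aai2, i)[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At(aai2, i)[j] - i + 1;
            }
        }
    }
    ArrayArrayInt2Delete(&aai2);  //应该放在循环块内。。
    endTime = clock();
    printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
    return 0;
}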

是的,使用算术和单个基指针是编译器在内部为非动态分配的2D(n维)数组执行的操作

这样可以获得最佳性能,因为每次访问只需要一次索引计算和一次内存查找。而对于问题中所示的(基于指针的)二维数组,每次数组访问需要两次指针查找和两次索引计算(一次索引计算和查找用于定位正确的行数组,第二次索引计算和查找用于访问该行数组中的元素)。如果是三维数组,则需要三次索引计算和三次查找。

您还可以分配更少的内存,并且需要更少的内存分配,但这些都是二阶效应

此外,正如一条评论中指出的(而我在上文没有提到),与使用多个较小的内存块相比(这些小块加起来占用的内存比单个大块更多),使用一个较大的内存块可以获得更好的引用局部性,也更有利于硬件预取。


我在 Mac OS X 10.10.2 Yosemite 上测试了下面这个用 GCC 4.9.1 编译的文件(sim2d.c):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * malloc() wrapper that never returns NULL: on failure it reports the
 * requested size on stderr and terminates the program.
 *
 * A zero-byte request is rounded up to one byte. malloc(0) is permitted to
 * return NULL even when nothing went wrong, and the plain NULL check would
 * then report a bogus out-of-memory error and abort on such platforms.
 *
 * The caller owns the returned block and releases it with free().
 */
static void *MjMalloc(size_t nbytes)
{
    void *rv = malloc(nbytes != 0 ? nbytes : 1);
    if (rv == NULL)
    {
        fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
        exit(EXIT_FAILURE);
    }
    return rv;
}

/* Mechanism 1 */
/* A heap-allocated run of ints together with its element count. */
struct ArrayInt {
    int *array;  /* element storage; set by ArrayIntCreate() */
    int length;  /* element count; set by ArrayIntCreate() */
};
typedef struct ArrayInt ArrayInt;

/*
 * Give *array a freshly allocated buffer of `length` ints (contents
 * uninitialized) and record `length` in it.
 */
static void ArrayIntCreate(ArrayInt *array, int length) {
    const size_t bytes = length * sizeof *array->array;

    array->length = length;
    array->array = MjMalloc(bytes);
}

/*
 * Release the buffer owned by *array.
 *
 * The struct is reset to an empty state afterwards so that a stale pointer
 * is never left behind: deleting the same ArrayInt twice is harmless
 * (free(NULL) is a no-op) instead of a double free.
 */
static void ArrayIntDelete(ArrayInt *array) {
    free(array->array);
    array->array = NULL;
    array->length = 0;
}

/* A two-dimensional int array stored as an array of ArrayInt rows. */
struct ArrayArrayInt {
    ArrayInt *array;  /* one ArrayInt per row; each row owns its own buffer */
    int length;       /* number of rows */
};
typedef struct ArrayArrayInt ArrayArrayInt;

/*
 * Build a table of `length` rows: one block holding the ArrayInt row
 * headers, then a separately allocated buffer of `length2` ints per row.
 */
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
    array->length = length;
    array->array = MjMalloc(length * sizeof *array->array);

    ArrayInt *row = array->array;
    int remaining = length;
    while (remaining-- > 0) {
        ArrayIntCreate(row++, length2);
    }
}

/*
 * Release every row buffer and then the block of row headers.
 *
 * The struct is reset to an empty state afterwards, so a second call sees
 * length == 0 and a NULL pointer and does nothing, rather than walking
 * freed memory and freeing it again.
 */
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
    for (int i = 0; i < array->length; i += 1) {
        ArrayIntDelete(&array->array[i]);
    }
    free(array->array);
    array->array = NULL;
    array->length = 0;
}

/* Mechanism 2 */
/* A two-dimensional int array stored in one contiguous buffer. */
struct ArrayArrayInt2 {
    int *array;   /* all elements; row i starts at index i * length2 */
    int length;   /* first dimension (number of rows) */
    int length2;  /* second dimension (elements per row) */
};
typedef struct ArrayArrayInt2 ArrayArrayInt2;

/*
 * Allocate one contiguous buffer of length * length2 ints (contents
 * uninitialized) and record both dimensions in *array.
 *
 * The element size is taken from the buffer's own element type (int).
 * Sizing it with sizeof(ArrayInt), a pointer-plus-int struct, requested
 * several times more memory than the array ever uses.
 */
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
    /* Widen before multiplying so the element count is computed in size_t
       rather than overflowing int for large dimensions. */
    size_t count = (size_t)length * (size_t)length2;

    array->array = MjMalloc(count * sizeof *array->array);
    array->length = length;
    array->length2 = length2;
}

/*
 * Release the single buffer owned by *array.
 *
 * The struct is reset to an empty state afterwards so no dangling pointer
 * remains and a repeated delete is a harmless free(NULL).
 */
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
    free(array->array);
    array->array = NULL;
    array->length = 0;
    array->length2 = 0;
}

/*
 * Accessors for an ArrayArrayInt2 object (passed by value, not by pointer).
 *
 *   aai2At(aai2, i)     - pointer to the first element of row i
 *   aai2At2(aai2, i, j) - lvalue for the element at row i, column j
 *
 * The aai2 argument is parenthesized so that expressions such as *ptr work;
 * without the parentheses `*ptr.array` parses as `*(ptr.array)`.
 * Note that aai2 is evaluated twice, so it must not have side effects.
 */
#define aai2At(aai2, i) (&(aai2).array[(i) * (aai2).length2])
#define aai2At2(aai2, i, j) ((aai2).array[(i) * (aai2).length2 + (j)])

/* Head-to-head testing */
/* Convert the interval between two clock() readings into seconds. */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
    const clock_t ticks = endTime - startTime;

    return (double)ticks / CLOCKS_PER_SEC;
}

/* Size of each dimension of the test arrays (they are created N x N). */
#define N 2000
/* Number of create/fill/sum/delete repetitions inside each timed test. */
#define N_CYCLES    1000

/*
 * Run the three timed benchmarks once and print one result line for each.
 *
 *   aai1 - ArrayArrayInt (separately allocated rows), indexed as
 *          aai.array[i].array[j]
 *   aai2 - ArrayArrayInt2 (single allocation), indexed via aai2At(aai2, i)[j]
 *   aai3 - ArrayArrayInt2 (single allocation), indexed via aai2At2(aai2, i, j)
 *
 * Each benchmark repeats N_CYCLES times: create an N x N array, store the
 * row index i in every element of row i, read the row back from the last
 * column to the first while accumulating into sum, then delete the array.
 * Creation and deletion happen inside the timed region, so allocation cost
 * is part of every measurement.
 */
static void one_test_cycle(void)
{
    ArrayArrayInt aai;
    ArrayArrayInt2 aai2;
    long long int sum;
    clock_t startTime, endTime;

    /* Test 1: array of separately allocated rows. */
    startTime = clock();             
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayIntCreate(&aai, N, N);
        for (int i = 0; i < aai.length; i += 1) {
            int j = 0;
            /* Forward pass: fill row i with the value i. */
            for (; j < aai.array[i].length; j += 1) {
                aai.array[i].array[j] = i;
            }
            /* Backward pass: j starts at the row length and counts down.
               Each element equals i, so every term adds exactly 1. */
            while ((j -= 1) >= 0) {
                sum += aai.array[i].array[j] - i + 1;
            }
        }
        ArrayArrayIntDelete(&aai);
    }
    endTime = clock();
    printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    /* Test 2: single allocation, row pointer from aai2At() then [j]. */
    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At(aai2, i)[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At(aai2, i)[j] - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    /* Test 3: single allocation, combined (i, j) index via aai2At2(). */
    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At2(aai2, i, j) = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At2(aai2, i, j) - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}

/*
 * Print "<tag>: YYYY-MM-DD HH:MM:SS" (local time) on stdout.
 *
 * localtime() may return NULL and strftime() returns 0 when it cannot
 * produce the string, leaving the buffer contents indeterminate. In either
 * case a placeholder is printed instead of dereferencing NULL or printing
 * an unset buffer.
 */
static void print_now(const char *tag)
{
    time_t now = time(0);
    struct tm *lt = localtime(&now);
    char buffer[32];

    if (lt == NULL ||
        strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt) == 0) {
        printf("%s: (time unavailable)\n", tag);
        return;
    }
    printf("%s: %s\n", tag, buffer);
}

/* Print a start timestamp, run the benchmark set three times, print an end timestamp. */
int main(void)
{
    int runs_left = 3;

    print_now("Started");
    while (runs_left-- > 0) {
        one_test_cycle();
    }
    print_now("Finished");
    return 0;
}
将 N_CYCLES 设为 2000 时,我得到了类似的模式,只是运行时间变为原来的两倍——这并不令人意外。

我看到单一分配的代码有一个小但明显的优势(耗时大约减少13%),而针对单一分配机制的两个测试(“aai2”和“aai3”)的计时之间没有显著差异。

基本统计:

# All data
# Count    = 9
# Mean     =  6.250000e+00
# Std Dev  =  3.807230e-01

# aai1 only:
# Count    = 3
# Mean     =  6.756667e+00
# Std Dev  =  4.041452e-02

# aai2 and aai3:
# Count    = 6
# Mean     =  5.996667e+00
# Std Dev  =  1.505545e-02

# aai2 only:
# Count    = 3
# Mean     =  6.006667e+00
# Std Dev  =  1.527525e-02

# aai3 only:
# Count    = 3
# Mean     =  5.986667e+00
# Std Dev  =  5.773503e-03
显然,严格确保机器上没有其他负载、运行更多次测试迭代以及采取类似的基准测试措施可能会改进数据,但单分配
aai2
机制在这台机器上的性能优于多分配
aai
机制。(顺便说一下:当人们有两个或更多版本的代码时,为什么不在他们的第一个版本上加后缀1?)


硬件:17 英寸 MacBook Pro(2011 年初款),2.3 GHz Intel Core i7,16 GiB 1333 MHz DDR3 内存。

这是优化2D阵列内存分配性能的常用技术。通过减少对
malloc
的调用次数,您可以获得性能提升。您不需要任何库就可以使用真正的二维数组,而不需要使用最初使用的二维伪数组。自1999年以来,现代C就知道直接处理这个问题。只需执行
int (*arr)[n] = malloc(sizeof(int[m][n])); 这里用到的是“VLA”(可变长度数组)类型。已点赞;另外,通过单个连续分配,在按顺序逐行/逐层遍历时,您很可能还会获得更好的硬件预取效果。每一点提升都有帮助。@JonathanLeffler 我在原始帖子的测试代码中犯了一个巨大的错误。现在我在修复后得到了相反的结果。请参阅我的编辑。@xiver77:请参阅其他资料。我发现在我的机器上使用
aai2
机制比使用
aai
机制提高了13%。@JonathanLeffler有趣的结果。在我的机器中进行了许多测试(一些随机的Core i5、Win7、MinGW/GCC),
aai
开始运行,运行速度几乎与带有优化标志
-O2
或更高的
aai2
相同,甚至更快。没有优化,
aai2
总是更快。不确定,但随着优化级别的提高,gcc似乎以一种巧妙的方式优化了指针数组。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * malloc() wrapper that never returns NULL: on failure it reports the
 * requested size on stderr and terminates the program.
 *
 * A zero-byte request is rounded up to one byte. malloc(0) is permitted to
 * return NULL even when nothing went wrong, and the plain NULL check would
 * then report a bogus out-of-memory error and abort on such platforms.
 *
 * The caller owns the returned block and releases it with free().
 */
static void *MjMalloc(size_t nbytes)
{
    void *rv = malloc(nbytes != 0 ? nbytes : 1);
    if (rv == NULL)
    {
        fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
        exit(EXIT_FAILURE);
    }
    return rv;
}

/* Mechanism 1 */
/* A heap-allocated run of ints together with its element count. */
struct ArrayInt {
    int *array;  /* element storage; set by ArrayIntCreate() */
    int length;  /* element count; set by ArrayIntCreate() */
};
typedef struct ArrayInt ArrayInt;

/*
 * Give *array a freshly allocated buffer of `length` ints (contents
 * uninitialized) and record `length` in it.
 */
static void ArrayIntCreate(ArrayInt *array, int length) {
    const size_t bytes = length * sizeof *array->array;

    array->length = length;
    array->array = MjMalloc(bytes);
}

/*
 * Release the buffer owned by *array.
 *
 * The struct is reset to an empty state afterwards so that a stale pointer
 * is never left behind: deleting the same ArrayInt twice is harmless
 * (free(NULL) is a no-op) instead of a double free.
 */
static void ArrayIntDelete(ArrayInt *array) {
    free(array->array);
    array->array = NULL;
    array->length = 0;
}

/* A two-dimensional int array stored as an array of ArrayInt rows. */
struct ArrayArrayInt {
    ArrayInt *array;  /* one ArrayInt per row; each row owns its own buffer */
    int length;       /* number of rows */
};
typedef struct ArrayArrayInt ArrayArrayInt;

/*
 * Build a table of `length` rows: one block holding the ArrayInt row
 * headers, then a separately allocated buffer of `length2` ints per row.
 */
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
    array->length = length;
    array->array = MjMalloc(length * sizeof *array->array);

    ArrayInt *row = array->array;
    int remaining = length;
    while (remaining-- > 0) {
        ArrayIntCreate(row++, length2);
    }
}

/*
 * Release every row buffer and then the block of row headers.
 *
 * The struct is reset to an empty state afterwards, so a second call sees
 * length == 0 and a NULL pointer and does nothing, rather than walking
 * freed memory and freeing it again.
 */
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
    for (int i = 0; i < array->length; i += 1) {
        ArrayIntDelete(&array->array[i]);
    }
    free(array->array);
    array->array = NULL;
    array->length = 0;
}

/* Mechanism 2 */
/* A two-dimensional int array stored in one contiguous buffer. */
struct ArrayArrayInt2 {
    int *array;   /* all elements; row i starts at index i * length2 */
    int length;   /* first dimension (number of rows) */
    int length2;  /* second dimension (elements per row) */
};
typedef struct ArrayArrayInt2 ArrayArrayInt2;

/*
 * Allocate one contiguous buffer of length * length2 ints (contents
 * uninitialized) and record both dimensions in *array.
 *
 * The element size is taken from the buffer's own element type (int).
 * Sizing it with sizeof(ArrayInt), a pointer-plus-int struct, requested
 * several times more memory than the array ever uses.
 */
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
    /* Widen before multiplying so the element count is computed in size_t
       rather than overflowing int for large dimensions. */
    size_t count = (size_t)length * (size_t)length2;

    array->array = MjMalloc(count * sizeof *array->array);
    array->length = length;
    array->length2 = length2;
}

/*
 * Release the single buffer owned by *array.
 *
 * The struct is reset to an empty state afterwards so no dangling pointer
 * remains and a repeated delete is a harmless free(NULL).
 */
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
    free(array->array);
    array->array = NULL;
    array->length = 0;
    array->length2 = 0;
}

/*
 * Accessors for an ArrayArrayInt2 object (passed by value, not by pointer).
 *
 *   aai2At(aai2, i)     - pointer to the first element of row i
 *   aai2At2(aai2, i, j) - lvalue for the element at row i, column j
 *
 * The aai2 argument is parenthesized so that expressions such as *ptr work;
 * without the parentheses `*ptr.array` parses as `*(ptr.array)`.
 * Note that aai2 is evaluated twice, so it must not have side effects.
 */
#define aai2At(aai2, i) (&(aai2).array[(i) * (aai2).length2])
#define aai2At2(aai2, i, j) ((aai2).array[(i) * (aai2).length2 + (j)])

/* Head-to-head testing */
/* Convert the interval between two clock() readings into seconds. */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
    const clock_t ticks = endTime - startTime;

    return (double)ticks / CLOCKS_PER_SEC;
}

/* Size of each dimension of the test arrays (they are created N x N). */
#define N 2000
/* Number of create/fill/sum/delete repetitions inside each timed test. */
#define N_CYCLES    1000

/*
 * Run the three timed benchmarks once and print one result line for each.
 *
 *   aai1 - ArrayArrayInt (separately allocated rows), indexed as
 *          aai.array[i].array[j]
 *   aai2 - ArrayArrayInt2 (single allocation), indexed via aai2At(aai2, i)[j]
 *   aai3 - ArrayArrayInt2 (single allocation), indexed via aai2At2(aai2, i, j)
 *
 * Each benchmark repeats N_CYCLES times: create an N x N array, store the
 * row index i in every element of row i, read the row back from the last
 * column to the first while accumulating into sum, then delete the array.
 * Creation and deletion happen inside the timed region, so allocation cost
 * is part of every measurement.
 */
static void one_test_cycle(void)
{
    ArrayArrayInt aai;
    ArrayArrayInt2 aai2;
    long long int sum;
    clock_t startTime, endTime;

    /* Test 1: array of separately allocated rows. */
    startTime = clock();             
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayIntCreate(&aai, N, N);
        for (int i = 0; i < aai.length; i += 1) {
            int j = 0;
            /* Forward pass: fill row i with the value i. */
            for (; j < aai.array[i].length; j += 1) {
                aai.array[i].array[j] = i;
            }
            /* Backward pass: j starts at the row length and counts down.
               Each element equals i, so every term adds exactly 1. */
            while ((j -= 1) >= 0) {
                sum += aai.array[i].array[j] - i + 1;
            }
        }
        ArrayArrayIntDelete(&aai);
    }
    endTime = clock();
    printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    /* Test 2: single allocation, row pointer from aai2At() then [j]. */
    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At(aai2, i)[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At(aai2, i)[j] - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    /* Test 3: single allocation, combined (i, j) index via aai2At2(). */
    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At2(aai2, i, j) = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At2(aai2, i, j) - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}

/*
 * Print "<tag>: YYYY-MM-DD HH:MM:SS" (local time) on stdout.
 *
 * localtime() may return NULL and strftime() returns 0 when it cannot
 * produce the string, leaving the buffer contents indeterminate. In either
 * case a placeholder is printed instead of dereferencing NULL or printing
 * an unset buffer.
 */
static void print_now(const char *tag)
{
    time_t now = time(0);
    struct tm *lt = localtime(&now);
    char buffer[32];

    if (lt == NULL ||
        strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt) == 0) {
        printf("%s: (time unavailable)\n", tag);
        return;
    }
    printf("%s: %s\n", tag, buffer);
}

/* Print a start timestamp, run the benchmark set three times, print an end timestamp. */
int main(void)
{
    int runs_left = 3;

    print_now("Started");
    while (runs_left-- > 0) {
        one_test_cycle();
    }
    print_now("Finished");
    return 0;
}
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
# All data
# Count    = 9
# Mean     =  6.250000e+00
# Std Dev  =  3.807230e-01

# aai1 only:
# Count    = 3
# Mean     =  6.756667e+00
# Std Dev  =  4.041452e-02

# aai2 and aai3:
# Count    = 6
# Mean     =  5.996667e+00
# Std Dev  =  1.505545e-02

# aai2 only:
# Count    = 3
# Mean     =  6.006667e+00
# Std Dev  =  1.527525e-02

# aai3 only:
# Count    = 3
# Mean     =  5.986667e+00
# Std Dev  =  5.773503e-03