C: Aligning an array of 8-bit integers for message passing that requires long alignment

Tags: c, arrays, memory-alignment, message-passing, numa

I am currently working with OpenMP on a processor with a NUMA architecture, using the User Dynamic Network (UDN) that the hardware provides to route packets from each thread to the main thread. The overall idea of the program is simple: parallelize an element-wise subtraction of two arrays of 8-bit integers and store the result in a third array (C[i] = A[i] - B[i]).

Because of the NUMA architecture of the processor I am using, allocating the arrays and then just putting a simple OpenMP for-loop pragma on the loop gives very poor performance. One way I got better performance was to allocate equally sized private arrays in each thread and have each thread work only on the private data it allocated itself. The idea is that each thread works locally, avoiding the overhead of fetching data from other cores.

While this improves performance, I am now trying to accumulate the data from each thread into one final array that the main program can use outside the parallel region. However, when passing messages over the UDN, one requirement in the datasheet is that both the buffer being sent and the buffer that receives the data must be long-aligned.
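For concreteness, here is a minimal sketch of what I understand long alignment to mean in practice, assuming it requires the address to be a multiple of sizeof(long) (8 bytes on a 64-bit part). The helper names below are my own and are not part of the TMC/UDN API:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical helpers, not part of the TMC/UDN API. */

/* Check whether a pointer is long-aligned. */
static int is_long_aligned(const void *p)
{
  return ((uintptr_t)p % sizeof(long)) == 0;
}

/* Allocate a buffer whose base address is guaranteed to be long-aligned.
   posix_memalign requires the alignment to be a power of two and a
   multiple of sizeof(void *), which sizeof(long) satisfies here. */
static uint8_t *alloc_long_aligned(size_t bytes)
{
  void *p = NULL;
  if(posix_memalign(&p, sizeof(long), bytes) != 0)
    return NULL;
  return (uint8_t *)p;
}

Note that even with an aligned base pointer, an interior pointer such as &buf[k] is only long-aligned when k itself is a multiple of sizeof(long).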

Here is my code:

int size = 6000;
int threads = 20;

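/* Initialize the UDN before any thread uses it. */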
if(tmc_udn_init(NULL) != 0)
{
  printf("tmc_udn_init failure\n");
}

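/* C_final: shared destination array that will hold the gathered results from every thread. */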
alloc_attr_t attrCfinal = ALLOC_INIT;
mspace mspCfinal = create_mspace_with_attr(0,0,&attrCfinal);
uint8_t *C_final = mspace_malloc(mspCfinal, sizeof(uint8_t)*(size*size));

#pragma omp parallel num_threads(threads)
{
  int j;

  int id = omp_get_thread_num();
  if(tmc_cpus_set_my_cpu(id) < 0)
  {
    printf("tmc_cpus_set_my_cpu failure, thread = %d\n", id);
  }

  if(tmc_udn_activate() < 0)
  {
    printf("tmc_udn_activate() failure, threads = %d\n", id);
  }

  #pragma omp barrier

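  /* Per-thread private arrays, homed locally (ALLOC_HOME_HERE) so each thread works on local memory. */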
  alloc_attr_t attrA = ALLOC_INIT;
  alloc_attr_t attrB = ALLOC_INIT;
  alloc_attr_t attrC = ALLOC_INIT;

  alloc_set_home(&attrA, ALLOC_HOME_HERE);
  alloc_set_home(&attrB, ALLOC_HOME_HERE);
  alloc_set_home(&attrC, ALLOC_HOME_HERE);

  mspace mspA = create_mspace_with_attr(0,0,&attrA);
  mspace mspB = create_mspace_with_attr(0,0,&attrB);
  mspace mspC = create_mspace_with_attr(0,0,&attrC);

  uint8_t *A_priv = mspace_malloc(mspA, sizeof(uint8_t)*((size*size)/threads));
  uint8_t *B_priv = mspace_malloc(mspB, sizeof(uint8_t)*((size*size)/threads));
  uint8_t *C_priv = mspace_malloc(mspC, sizeof(uint8_t)*((size*size)/threads));

  for(j=0; j<((size*size)/threads); j++)
  {
    A_priv[j] = 100;
    B_priv[j] = omp_get_thread_num();
    C_priv[j] = 0;
  }

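  /* Element-wise subtraction on this thread's private chunk: C[i] = A[i] - B[i]. */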
  for(j=0; j<((size*size)/threads); j++)
  {
    C_priv[j] = A_priv[j] - B_priv[j];
  }

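  /* Gather: thread 0 receives each worker's thread number on UDN demux 1 and then its chunk
     in 100-byte pieces on demux 0; every other thread sends its chunk to thread 0. */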
  if(omp_get_thread_num() == 0)
  {
    int k = 0;
    int h = 0;

    for(k=1; k<threads; k++)
    {
      int thread_num = tmc_udn1_receive();

      for(h=0; h<((size*size)/threads)/100; h++)
      {
        tmc_udn0_receive_buffer(&C_final[((thread_num-1)*((size*size)/threads))+(h*100)], 100);
      }
    }
  }
  else
  {
    #pragma omp critical
    {
      DynamicHeader dest = tmc_udn_header_from_cpu(0);
      tmc_udn_send_1(dest, UDN1_DEMUX_TAG, omp_get_thread_num());

      int p = 0;
      for(p=0; p<((size*size)/threads)/100; p++)
      {
        tmc_udn_send_buffer(dest, UDN0_DEMUX_TAG, &C_priv[p*100], 100);
      }
    }
  }
}
However, these are the results I get:

C_final[0] = 99
C_final[1875000] = -16
C_final[3750000] = 97
C_final[5625000] = 96
C_final[7500000] = 95
C_final[9375000] = -16
C_final[11250000] = -16
C_final[13125000] = 92
C_final[15000000] = -16
C_final[16875000] = 90
C_final[18750000] = -16
C_final[20625000] = 88
C_final[22500000] = -16
C_final[24375000] = -16
C_final[26250000] = 85
C_final[28125000] = -16
C_final[30000000] = -16
C_final[31875000] = 82
C_final[33750000] = -16
C_final[35625000] = -16
C_final[37500000] = -16
C_final[39375000] = 78
C_final[41250000] = -16
C_final[43125000] = 76
C_final[45000000] = -16
C_final[46875000] = -16
C_final[48750000] = 73
C_final[50625000] = 72
C_final[52500000] = -16
C_final[54375000] = -16

Which results are correct and which are not changes every time I run it. It is not always the same indices that hold the correct values, and the wrong values that show up are not consistent either.

Did you compile with all warnings enabled? I think this could be a pointer-alignment error, or possibly a serious aliasing violation. One possible fix is to memcpy the data into a properly aligned buffer well before sending it.

I compile with -Wall and get no warnings related to this. I have recently noticed that if a chunk of results is received from one thread and placed at a position in C_final that lies before another chunk that has already been received, the results become jumbled. For instance, thread 3's results would be fine until thread 2's results came along and were placed at indices before them. I managed to make the threads send their results in thread order, and now my results are accurate (shown below; a sketch of the memcpy idea follows after the output):
C_final[0] = 99
C_final[1875000] = 98
C_final[3750000] = 97
C_final[5625000] = 96
C_final[7500000] = 95
C_final[9375000] = 94
C_final[11250000] = 93
C_final[13125000] = 92
C_final[15000000] = 91
C_final[16875000] = 90
C_final[18750000] = 89
C_final[20625000] = 88
C_final[22500000] = 87
C_final[24375000] = 86
C_final[26250000] = 85
C_final[28125000] = 84
C_final[30000000] = 83
C_final[31875000] = 82
C_final[33750000] = 81
C_final[35625000] = 80
C_final[37500000] = 79
C_final[39375000] = 78
C_final[41250000] = 77
C_final[43125000] = 76
C_final[45000000] = 75
C_final[46875000] = 74
C_final[48750000] = 73
C_final[50625000] = 72
C_final[52500000] = 71
C_final[54375000] = -1
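For reference, here is a hedged sketch of the memcpy suggestion above, applied to the send and receive loops from the question. The staging buffers and the __attribute__((aligned(8))) are my additions (assuming long alignment means 8 bytes); the UDN calls and index arithmetic mirror the original code. Note that because 100 is not a multiple of 8, interior pointers such as &C_priv[p*100] and the corresponding offsets into C_final are only 8-byte aligned for even values of p and h, so staging through an aligned buffer avoids that even when the base allocations are aligned.

#include <string.h>   /* memcpy */

/* Sender side (threads other than 0): stage each 100-byte piece in an aligned buffer. */
uint8_t send_stage[100] __attribute__((aligned(8)));   /* per-thread stack buffer, 8-byte aligned */

DynamicHeader dest = tmc_udn_header_from_cpu(0);
tmc_udn_send_1(dest, UDN1_DEMUX_TAG, omp_get_thread_num());

int p;
for(p=0; p<((size*size)/threads)/100; p++)
{
  memcpy(send_stage, &C_priv[p*100], 100);                     /* copy from the (possibly unaligned) offset */
  tmc_udn_send_buffer(dest, UDN0_DEMUX_TAG, send_stage, 100);  /* send from aligned memory */
}

/* Receiver side (thread 0): receive into an aligned buffer, then copy into C_final. */
uint8_t recv_stage[100] __attribute__((aligned(8)));

int k, h;
for(k=1; k<threads; k++)
{
  int thread_num = tmc_udn1_receive();
  for(h=0; h<((size*size)/threads)/100; h++)
  {
    tmc_udn0_receive_buffer(recv_stage, 100);
    memcpy(&C_final[((thread_num-1)*((size*size)/threads))+(h*100)], recv_stage, 100);
  }
}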