CUDA中奇偶数的分离

CUDA中奇偶数的分离,cuda,Cuda,我有一个数字数组,{1,2,3,4,5,6,7,8,9,10},我想把偶数和奇数分开,如下所示: even = {2,4,6,8} 以及: 我知道CUDA中的原子操作,也知道输出预计不会受到竞争条件的影响。我不想使用原子操作。我如何在不使用原子关键字的情况下实现这一点 代码: #include <stdio.h> #include <cuda.h> // Kernel that executes on the CUDA device __global__ void s

我有一个数字数组,
{1,2,3,4,5,6,7,8,9,10}
,我想把偶数和奇数分开,如下所示:

even = {2,4,6,8,10}
以及:

odd = {1,3,5,7,9}

我知道CUDA中的原子操作,也知道输出预计不会受到竞争条件的影响。我不想使用原子操作。我如何在不使用原子关键字的情况下实现这一点

代码:

#include <stdio.h>
#include <cuda.h>

// Kernel that executes on the CUDA device
// Splits `total` into its even and odd values without using atomics.
//
// Each thread handles one input element. Instead of searching the output
// array for a free (zero) slot -- which races with every other thread doing
// the same -- a thread counts how many earlier elements of `total` share its
// parity and uses that count as its output index. `total` is only read, and
// every thread writes to a distinct slot, so no synchronization is needed and
// the relative order of the input is preserved.
//
// Parameters:
//   total - input values (N elements); each value is truncated to int before
//           the parity test
//   even  - receives the even values, packed from index 0 (capacity >= N)
//   odd   - receives the odd values, packed from index 0 (capacity >= N)
//   N     - number of elements in `total`
//
// Launch: 1D grid/block with at least N threads in total; surplus threads
// exit immediately. Slots past the packed values are left untouched.
// Cost is O(idx) global reads per thread; for large N a parallel prefix sum
// (or a library partition routine) is the better tool.
__global__ void square_array(float *total,float *even,float *odd, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) return;               // guard the tail of the grid

  const float value = total[idx];
  const bool isEven = (((int)value) % 2) == 0;

  // Rank of this element among the elements of the same parity before it.
  int slot = 0;
  for (int i = 0; i < idx; i++)
  {
    const bool otherIsEven = (((int)total[i]) % 2) == 0;
    if (otherIsEven == isEven) slot++;
  }

  if (isEven)
  {
    even[slot] = value;
  }
  else
  {
    odd[slot] = value;
  }
}

// main routine that executes on the host
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return 1;
  fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
  return 0;
}

// main routine that executes on the host
//
// Fills an array with 1..N, runs square_array on the device to split it into
// even and odd values, then prints the input and both output arrays.
// Returns 0 on success and 1 if any allocation or CUDA call failed.
int main(void)
{
  float *total_h = NULL, *even_h = NULL, *odd_h = NULL;  // host arrays
  float *total_d = NULL, *even_d = NULL, *odd_d = NULL;  // device arrays
  const int N = 10;  // Number of elements in arrays
  const size_t size = N * sizeof(float);

  total_h = (float *)malloc(size); // Allocate arrays on host
  even_h = (float *)malloc(size);
  odd_h = (float *)malloc(size);

  int ok = (total_h != NULL) && (even_h != NULL) && (odd_h != NULL);
  if (!ok) fprintf(stderr, "host allocation failed\n");

  // Allocate device arrays; the outputs are zero-filled so that unused
  // trailing slots print as 0. `&&` short-circuits, so nothing further runs
  // once a step has failed.
  ok = ok && cudaOk(cudaMalloc((void **) &total_d, size), "cudaMalloc total_d");
  ok = ok && cudaOk(cudaMalloc((void **) &even_d, size), "cudaMalloc even_d");
  ok = ok && cudaOk(cudaMemset(even_d, 0, size), "cudaMemset even_d");
  ok = ok && cudaOk(cudaMalloc((void **) &odd_d, size), "cudaMalloc odd_d");
  ok = ok && cudaOk(cudaMemset(odd_d, 0, size), "cudaMemset odd_d");

  if (ok)
  {
    // Initialize host array and copy it to CUDA device
    for (int i=0; i<N; i++) total_h[i] = (float)i+1;
    ok = cudaOk(cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice),
                "copy total to device");
  }

  if (ok)
  {
    // Do calculation on device: one block with exactly one thread per
    // element, so N must not exceed the per-block thread limit.
    square_array <<< 1,N >>> (total_d,even_d,odd_d, N);
    // A launch returns no status itself: query the launch error first,
    // then synchronize to surface errors raised while the kernel ran.
    ok = cudaOk(cudaGetLastError(), "kernel launch")
      && cudaOk(cudaDeviceSynchronize(), "kernel execution");
  }

  // Retrieve result from device and store it in host array
  ok = ok && cudaOk(cudaMemcpy(even_h, even_d, size, cudaMemcpyDeviceToHost),
                    "copy even to host");
  ok = ok && cudaOk(cudaMemcpy(odd_h, odd_d, size, cudaMemcpyDeviceToHost),
                    "copy odd to host");

  if (ok)
  {
    // Print results
    printf("total Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",total_h[i]);

    printf("EVEN Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",even_h[i]);

    printf("ODD Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",odd_h[i]);
  }

  // Cleanup (free(NULL) and cudaFree(NULL) are no-ops, so this is safe on
  // every failure path)
  free(total_h);
  free(even_h);
  free(odd_h);

  cudaFree(total_d);
  cudaFree(even_d);
  cudaFree(odd_d);

  return ok ? 0 : 1;
}
#包括
#包括
//在CUDA设备上执行的内核
__全局无效平方数组(浮点*总计、浮点*偶数、浮点*奇数、整数N)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
int a=总[idx];
如果((a%2)=0)
{  

正如 Jared Hoberock 所建议的,使用 CUDA Thrust 库中提供的高效分区算法,比自己从头开发分区例程要容易得多。下面是一个完整的可运行示例:

#include <thrust\device_vector.h>
#include <thrust\partition.h>
#include <thrust\execution_policy.h>

// Unary predicate: true when x is divisible by 2.
// Callable from both host and device code. operator() is const so the
// predicate can also be invoked through a const object or reference.
struct is_even {
    __host__ __device__ bool operator()(const int &x) const { return (x % 2) == 0; }
};

// Partitions 0..N-1 into even and odd values on the device with
// thrust::partition_copy and prints both groups.
//
// NOTE(review): the #include lines above this program use backslashes as path
// separators; forward slashes (<thrust/partition.h>) are the portable form.
int main() {

    const int N = 10;

    thrust::host_vector<int> h_data(N);
    for (int i=0; i<N; i++) h_data[i] = i;

    thrust::device_vector<int> d_data(h_data);
    // Either output may receive up to N elements (e.g. all-even input), so
    // size both for the worst case instead of assuming an even N/2 split.
    thrust::device_vector<int> d_evens(N);
    thrust::device_vector<int> d_odds(N);

    // partition_copy returns the end iterator of each output range; the
    // distances from begin() give the number of values actually written.
    typedef thrust::device_vector<int>::iterator DeviceIter;
    thrust::pair<DeviceIter, DeviceIter> ends =
        thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
    const int numEvens = (int)(ends.first - d_evens.begin());
    const int numOdds  = (int)(ends.second - d_odds.begin());

    printf("Even numbers\n");
    for (int i=0; i<numEvens; i++) {
        int val = d_evens[i];   // element access copies one value device -> host
        printf("evens[%i] = %i\n",i,val);
    }

    printf("Odd numbers\n");
    for (int i=0; i<numOdds; i++) {
        int val = d_odds[i];
        printf("odds[%i] = %i\n",i,val);
    }

    return 0;
}
#包括
#包括
#包括
结构是偶数{{uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu;
void main(){
常数int N=10;
推力::主机向量h_数据(N);

使用 thrust::partition 或 thrust::partition_copy。
#include <thrust\device_vector.h>
#include <thrust\partition.h>
#include <thrust\execution_policy.h>

// Unary predicate: true when x is divisible by 2.
// Callable from both host and device code. operator() is const so the
// predicate can also be invoked through a const object or reference.
struct is_even {
    __host__ __device__ bool operator()(const int &x) const { return (x % 2) == 0; }
};

// Partitions 0..N-1 into even and odd values on the device with
// thrust::partition_copy and prints both groups.
//
// NOTE(review): the #include lines above this program use backslashes as path
// separators; forward slashes (<thrust/partition.h>) are the portable form.
int main() {

    const int N = 10;

    thrust::host_vector<int> h_data(N);
    for (int i=0; i<N; i++) h_data[i] = i;

    thrust::device_vector<int> d_data(h_data);
    // Either output may receive up to N elements (e.g. all-even input), so
    // size both for the worst case instead of assuming an even N/2 split.
    thrust::device_vector<int> d_evens(N);
    thrust::device_vector<int> d_odds(N);

    // partition_copy returns the end iterator of each output range; the
    // distances from begin() give the number of values actually written.
    typedef thrust::device_vector<int>::iterator DeviceIter;
    thrust::pair<DeviceIter, DeviceIter> ends =
        thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());
    const int numEvens = (int)(ends.first - d_evens.begin());
    const int numOdds  = (int)(ends.second - d_odds.begin());

    printf("Even numbers\n");
    for (int i=0; i<numEvens; i++) {
        int val = d_evens[i];   // element access copies one value device -> host
        printf("evens[%i] = %i\n",i,val);
    }

    printf("Odd numbers\n");
    for (int i=0; i<numOdds; i++) {
        int val = d_odds[i];
        printf("odds[%i] = %i\n",i,val);
    }

    return 0;
}