C++ 使用 Thrust 计算向量距离_C++_Thrust

C++ 使用 Thrust 计算向量距离

c++

C++ 推力矢量距离计算,c++,thrust,C++,Thrust,考虑以下数据集和质心。共有7个个体和两个平均值，每个个体有8个维度。它们是按主要顺序存储的 short dim = 8; float centroids[] = { 0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612, 0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441 }; float data[] = { 0.314, 0.504, 0.030,

考虑以下数据集和质心。共有7个个体和两个质心（均值），每个个体有8个维度。它们按行主序（row-major）存储：

// Number of components stored for every centroid and every individual.
short dim = 8;
// Two centroids stored back to back, one row of `dim` values per centroid.
float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612, 
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};   
// Seven individuals in the same layout, one row of `dim` values each.
float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744, 
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869, 
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769, 
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719, 
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530, 
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114
};

我想计算每一对质心与个体之间的欧几里德距离：c1-d1、c1-d2……在CPU上，我会这样做：

float dist = 0.0, dist_sqrt;
for(int i = 0; i < 2; i++)
    for(int j = 0; j < 7; j++)
    { 
        float dist_sum = 0.0;
        for(int k = 0; k < dim; k++)
        {
            dist = centroids[i * dim + k] - data[j * dim + k];
            dist_sum += dist * dist;
        }
        dist_sqrt = sqrt(dist_sum);
        // do something with the distance
        std::cout << dist_sqrt << std::endl;

    }

float dist=0.0，dist_sqrt；
对于（int i=0；i<2；i++）
对于（int j=0；j<7；j++）
{ 
浮点数和=0.0；
对于（int k=0；k…std::cout
这可以用 Thrust 完成。不过，解释具体做法会相当繁琐，代码也相当密集。
首先，关键的观察是：核心运算可以通过“变换后归约”来完成。Thrust 的变换操作用于对向量逐元素相减（个体减质心）并对每个差值求平方，归约则把这些结果相加，得到欧几里德距离的平方。此操作的起点是 thrust::reduce_by_key，但要把数据正确地提供给 reduce_by_key 相当费事。

最终结果通过对上面每个结果取平方根得到，这一步使用普通的 thrust::transform 即可。
以上是对完成全部工作的仅有的2行 Thrust 代码的概述。然而，第一行代码相当复杂。为了利用并行性，我采取的方法是把必要的向量按顺序虚拟地“铺开”，再提供给 reduce_by_key。举个简单的例子，假设我们有2个质心和4个个体，维数为2：
centroid 0: C00 C01
centroid 1: C10 C11
individ  0: I00 I01
individ  1: I10 I11
individ  2: I20 I21
individ  3: I30 I31

我们可以这样“布置”向量：
 C00 C01 C00 C01 C00 C01 C00 C01 C10 C11 C10 C11 C10 C11 C10 C11
 I00 I01 I10 I11 I20 I21 I30 I31 I00 I01 I10 I11 I20 I21 I30 I31

为了配合 reduce_by_key，我们还需要生成键值来划分各个向量的边界：
   0   0   1   1   0   0   1   1   0   0   1   1   0   0   1   1

上述“铺开”的数据集可能相当大，我们不希望承担存储和读取的开销，因此将使用 Thrust 提供的各种迭代器“动态”生成它们。这正是事情变得相当密集的地方。按照上述策略，我们将使用 thrust::reduce_by_key 来完成这项工作。我们会创建一个自定义函子，提供给 transform_iterator，对压缩（zip）在一起的 I 和 C 向量执行减法（和平方）运算。向量的“铺开”将通过 permutation_iterator 结合额外的自定义索引生成函子动态完成，以便在 I 和 C 中各自产生所需的重复模式。
因此，按“由内而外”的顺序，步骤如下：
1. 对于 I（data）和 C（centr），分别把 counting_iterator 与自定义索引函子组合在 transform_iterator 中，以生成我们需要的索引序列。
2. 使用步骤1中创建的索引序列和基础的 I、C 向量，通过 permutation_iterator 虚拟地“铺开”向量（每个铺开的向量对应一个 permutation_iterator）。
3. 把这2个“铺开”的虚拟 I 和 C 向量压缩（zip）在一起，创建一个（虚拟的）<float, float> 元组向量。
4. 取步骤3中的 zip_iterator，在 transform_iterator 中与自定义的距离计算函子（即 (C-I)^2）组合。

5. 使用另一个 transform_iterator，把 counting_iterator 与自定义的键生成函子组合，生成（虚拟的）键序列。
6. 把步骤4和步骤5中的迭代器作为待归约的输入（键、值）传给 reduce_by_key。reduce_by_key 的输出同样是键和值两个向量。我们不需要这些键，因此用 discard_iterator 把它们丢弃；值则保存下来。

以上步骤都在一行 Thrust 代码中完成。
以下是演示上述做法的代码：
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/copy.h>
#include <math.h>

#include <time.h>
#include <sys/time.h>
#include <stdlib.h>

#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001

// Microseconds per second; also used by callers to convert results to seconds.
#define USECPSEC 1000000ULL

// Wall-clock helper built on gettimeofday().
// Returns the current time of day in microseconds minus `prev`. Call it with 0
// to obtain a start stamp, then pass that stamp back in to get the number of
// microseconds elapsed since the first call.
unsigned long long dtime_usec(unsigned long long prev){
  timeval now;
  gettimeofday(&now, 0);
  const unsigned long long stamp_usec = (now.tv_sec * USECPSEC) + now.tv_usec;
  return stamp_usec - prev;
}

// Compares two float arrays element by element.
//   d1, d2 : arrays holding at least `len` values each
//   len    : number of elements to compare
// Returns 1 when every pair differs by at most TOL, otherwise prints the first
// offending index with both values and returns 0.
// A NaN on either side is reported as a mismatch; values that compare exactly
// equal (including two equal infinities) are accepted.
unsigned verify(float *d1, float *d2, int len){
  for (int i = 0; i < len; i++){
    if (d1[i] == d2[i]) continue;  // exact match, also covers inf == inf
    const float delta = fabsf(d1[i] - d2[i]);
    // Written as a negated "<=" so that a NaN delta fails the check;
    // "delta > TOL" is false for NaN and would let it pass silently.
    if (!(delta <= TOL)){
      std::cout << "mismatch at:  " << i << " val1: " << d1[i] << " val2: " << d2[i] << std::endl;
      return 0;
    }
  }
  return 1;
}
// CPU reference implementation of the all-pairs Euclidean distance.
//   centroids     : num_centroids rows of `dim` floats, stored row after row
//   data          : num_data rows of `dim` floats, stored row after row
//   rdist         : output, num_centroids*num_data floats; the distances for
//                   centroid 0 come first, then those for centroid 1, and so on
//   print         : when non-zero, each distance is also written to std::cout
//                   followed by ", ", and one newline is written at the end
void eucl_dist_cpu(const float *centroids, const float *data, float *rdist, int num_centroids, int dim, int num_data, int print){

  float *out = rdist;
  for (int c = 0; c < num_centroids; ++c){
    const float *centroid_row = centroids + c * dim;
    for (int d = 0; d < num_data; ++d){
      const float *data_row = data + d * dim;

      // Sum of squared component differences between the two rows.
      float sq_sum = 0.0;
      for (int k = 0; k < dim; ++k){
        const float diff = centroid_row[k] - data_row[k];
        sq_sum += diff * diff;
      }

      const float distance = sqrt(sq_sum);
      *out++ = distance;
      if (print) std::cout << distance << ", ";
    }
  }
  if (print) std::cout << std::endl;
}


// Key generator for reduce_by_key: maps a flat element index to the number of
// the `dim`-sized group it falls in, so consecutive runs of `dim` elements
// share one key.
struct dkeygen : public thrust::unary_function<int, int>
{
  int dim;   // group length used as the divisor
  int numd;  // stored by the constructor; not read by operator()

  dkeygen(const int dim_in, const int numd_in) : dim(dim_in), numd(numd_in) {}

  __host__ __device__ int operator()(const int flat_idx) const {
    const int group = flat_idx / dim;
    return group;
  }
};

// One pair of floats, as produced by the zipped iterator the functor is used with.
typedef thrust::tuple<float, float> mytuple;

// Squared difference of the two tuple members: (first - second)^2.
struct my_dist : public thrust::unary_function<mytuple, float>
{
  __host__ __device__ float operator()(const mytuple &my_tuple) const {
    const float first  = thrust::get<0>(my_tuple);
    const float second = thrust::get<1>(my_tuple);
    const float diff = first - second;
    return diff * diff;
  }
};


// Index functor for the data array: wraps a flat index around dim*numd, so the
// sequence of returned indices repeats with that period.
struct d_idx : public thrust::unary_function<int, int>
{
  int dim;   // components per row
  int numd;  // number of data rows

  d_idx(int dim_in, int numd_in) : dim(dim_in), numd(numd_in) {}

  __host__ __device__ int operator()(const int flat_idx) const {
    const int period = dim * numd;
    return flat_idx % period;
  }
};

// Index functor for the centroid array: within each block of dim*numd flat
// indices it cycles through the `dim` components of one centroid row, and it
// advances to the next row once per block.
struct c_idx : public thrust::unary_function<int, int>
{
  int dim;   // components per row
  int numd;  // number of data rows each centroid row is paired with

  c_idx(int dim_in, int numd_in) : dim(dim_in), numd(numd_in) {}

  __host__ __device__ int operator()(const int flat_idx) const {
    const int component = flat_idx % dim;
    const int row = flat_idx / (dim * numd);
    return component + (dim * row);
  }
};

// Element-wise single-precision square root, for use with thrust::transform.
struct my_sqrt : public thrust::unary_function<float, float>
{
  __host__ __device__ float operator()(const float val) const {
    const float root = sqrtf(val);
    return root;
  }
};


// Thrust version of the all-pairs Euclidean distance.
//   centroids : num_centroids rows of `dim` floats
//   data      : num_data rows of `dim` floats
//   dist      : output, receives num_centroids*num_data distances in the same
//               order as eucl_dist_cpu (all data rows for centroid 0 first)
//   print     : when non-zero, the results are also written to std::cout
// Returns the time in microseconds spent between the two dtime_usec() calls,
// i.e. the reduce_by_key, the sqrt transform and the device synchronize. The
// vector copies before and after that region are not included.
unsigned long long eucl_dist_thrust(thrust::host_vector<float> &centroids, thrust::host_vector<float> &data, thrust::host_vector<float> &dist, int num_centroids, int dim, int num_data, int print){

  // Copy the inputs into device vectors and allocate the result buffer.
  thrust::device_vector<float> d_data = data;
  thrust::device_vector<float> d_centr = centroids;
  thrust::device_vector<float> values_out(num_centroids*num_data);

  unsigned long long compute_time = dtime_usec(0);

  // One term per (centroid, data row, component) triple.
  const int total_terms = dim*num_data*num_centroids;

  // Keys: every run of `dim` consecutive terms shares one key.
  auto keys_first = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), dkeygen(dim, num_data));
  auto keys_last  = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(total_terms), dkeygen(dim, num_data));

  // Index sequences that select, for each term, the centroid component and
  // the data component to be compared.
  auto centr_index = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), c_idx(dim, num_data));
  auto data_index  = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), d_idx(dim, num_data));

  // Virtual "laid out" vectors: nothing is materialised, elements are
  // fetched through the index sequences on access.
  auto centr_laid_out = thrust::make_permutation_iterator(d_centr.begin(), centr_index);
  auto data_laid_out  = thrust::make_permutation_iterator(d_data.begin(), data_index);

  // Pair the two sequences and turn every pair into a squared difference.
  auto paired = thrust::make_zip_iterator(thrust::make_tuple(centr_laid_out, data_laid_out));
  auto squared_diffs = thrust::make_transform_iterator(paired, my_dist());

  // Sum the squared differences per key; the output keys are discarded.
  thrust::reduce_by_key(keys_first, keys_last, squared_diffs, thrust::make_discard_iterator(), values_out.begin());
  // Squared distance -> distance, in place.
  thrust::transform(values_out.begin(), values_out.end(), values_out.begin(), my_sqrt());
  cudaDeviceSynchronize();
  compute_time = dtime_usec(compute_time);

  if (print){
    thrust::copy(values_out.begin(), values_out.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << std::endl;
  }
  thrust::copy(values_out.begin(), values_out.end(), dist.begin());
  return compute_time;
}


int main(int argc, char *argv[]){

  int dim = 8;
  int num_centroids = 2;
  float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
  };
  int num_data = 8;
  float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114,
    0.721, 0.555, 0.979, 0.412, 0.007, 0.501, 0.844, 0.234
  };
  std::cout << "cpu results: " << std::endl;
  float dist[num_data*num_centroids];
  eucl_dist_cpu(centroids, data, dist, num_centroids, dim, num_data, 1);

  thrust::host_vector<float> h_data(data, data + (sizeof(data)/sizeof(float)));
  thrust::host_vector<float> h_centr(centroids, centroids + (sizeof(centroids)/sizeof(float)));
  thrust::host_vector<float> h_dist(num_centroids*num_data);

  std::cout << "gpu results: " << std::endl;
  eucl_dist_thrust(h_centr, h_data, h_dist, num_centroids, dim, num_data, 1);

  float *data2, *centroids2, *dist2;
  num_centroids = 10;
  num_data = 1000000;

  if (argc > 2) {
    num_centroids = atoi(argv[1]);
    num_data = atoi(argv[2]);
    if ((num_centroids < 1) || (num_centroids > MAX_CENT)) {std::cout << "Num centroids out of range" << std::endl; return 1;}
    if ((num_data < 1) || (num_data > MAX_DATA)) {std::cout << "Num data out of range" << std::endl; return 1;}
    if (num_data * dim * num_centroids > 2000000000) {std::cout << "data set out of range" << std::endl; return 1;}}
  std::cout << "Num Data: " << num_data << std::endl;
  std::cout << "Num Cent: " << num_centroids << std::endl;
  std::cout << "result size: " << ((num_data*num_centroids*4)/1048576) << " Mbytes" << std::endl;
  data2 = new float[dim*num_data];
  centroids2 = new float[dim*num_centroids];
  dist2 = new float[num_data*num_centroids];
  for (int i = 0; i < dim*num_data; i++) data2[i] = rand()/(float)RAND_MAX;
  for (int i = 0; i < dim*num_centroids; i++) centroids2[i] = rand()/(float)RAND_MAX;
  unsigned long long dtime = dtime_usec(0);
  eucl_dist_cpu(centroids2, data2, dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "cpu time: " << dtime/(float)USECPSEC << "s" << std::endl;
  thrust::host_vector<float> h_data2(data2, data2 + (dim*num_data));
  thrust::host_vector<float> h_centr2(centroids2, centroids2 + (dim*num_centroids));
  thrust::host_vector<float> h_dist2(num_data*num_centroids);
  dtime = dtime_usec(0);
  unsigned long long ctime = eucl_dist_thrust(h_centr2, h_data2, h_dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "gpu total time: " << dtime/(float)USECPSEC << "s, gpu compute time: " << ctime/(float)USECPSEC << "s" << std::endl;
  if (!verify(dist2, &(h_dist2[0]), num_data*num_centroids)) {std::cout << "Verification failure." << std::endl; return 1;}
  std::cout << "Success!" << std::endl;

  return 0;

}

#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001
无符号long-long-dtime\u-usec（无符号long-long-prev）{
#define USECPSEC 1000000ULL
timeval-tv1；
gettimeofday（&tv1,0）；
返回（（tv1.tv_sec*USECPSEC）+tv1.tv_usec）-prev；
}
无符号验证（浮点*d1，浮点*d2，整数长度）{
无符号过程=1；
对于（int i=0；iTOL）{
std：：
可以用 Thrust 来完成吗？我不认为有内置的解决方案；需要把 Thrust 的多种概念组合起来。我想出了一种方法，但速度并没有快很多（约2倍）。要让它运行得更快，可能需要更清楚地了解你实际预期的数据规模；使用 CUDA 或 Thrust 之外的其他 GPU 方法也可能更快。如果你愿意，我可以向你展示我的做法，但如果你不熟悉 Thrust，就需要对其中的概念加以解释。对于你所展示的（较小的）数据规模，GPU 可能并没有用处。
谢谢罗伯特，如果你能展示给我，我将不胜感激。关于我的数据规模：上面只是其中一部分。我实际上有超过1亿个个体和大约5000个质心。
1亿个个体和5000个质心。每个个体的维度仍然是8吗？
是的，每个都是8维。事实上，我担心必须把数据拆分开，否则无法放入GPU的内存。