Graph 一种使用 CUDA Thrust 快速计算数组的方法

Graph 一种快速计算推力cuda中阵列的方法,graph,cuda,thrust,Graph,Cuda,Thrust,我有四个阵列: thrust::device_vector<int> vertices; thrust::device_vector<int> adjacency: thrust::device_vector<int> degree_summation; thrust::device_vector<int> degree; 我想要的是: degree_summation = {6,6,6} 目前,我正在使用for循环来计算这个值,但我认为也许有

我有四个阵列:

// vertices[v]: position in `adjacency` of the first neighbour of vertex v.
thrust::device_vector<int> vertices;
// adjacency: neighbour vertex ids of all vertices, stored back to back.
thrust::device_vector<int> adjacency;
// degree_summation[v]: output, accumulated from the degrees of v's neighbours.
thrust::device_vector<int> degree_summation;
// degree[v]: degree of vertex v, looked up by neighbour id.
thrust::device_vector<int> degree;
我想要的是:

degree_summation = {6,6,6}
目前,我正在使用 for 循环来计算这个值,但我认为也许可以使用 Thrust 提供的原语更快地计算这些值。我用于计算度数总和(degree_summation)数组的内核:

// Adds, for each vertex, the degrees of its neighbours to degree_summation.
//
// One thread processes one vertex (thread index == vertex id).
//
// vertices         : vertices[v] is the position in `adjacency` of the first
//                    neighbour of vertex v.
// adjacency        : neighbour vertex ids, stored back to back per vertex.
// degree_summation : output; the per-vertex sum is ADDED to the existing
//                    contents, so the caller decides the initial value
//                    (presumably zero — confirm at the call site).
// degree           : degree[v] is the degree of vertex v.
// num_vertices     : number of entries in `vertices`.
// num_edges        : number of entries in `adjacency`.
//
// Neighbours equal to the vertex itself are not counted.
__global__ void computeDegreeSummation(int* vertices,int* adjacency,unsigned int* degree_summation, unsigned int * degree,unsigned int num_vertices, unsigned int num_edges){
    // Blocks are numbered row-major, so the row stride is gridDim.x
    // (the original used gridDim.y, which is only right for square or 1D grids).
    unsigned long int tid = (gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x+threadIdx.x;
    if(tid < num_vertices){
        int pos_first_neighbor = vertices[tid];

        // By default the neighbourhood runs to the end of `adjacency`; this is
        // the case for the last vertex.
        int pos_last_neighbor = static_cast<int>(num_edges) - 1;

        // Otherwise it ends just before the offset of the next vertex whose
        // entry is non-zero. Zero entries are skipped, as in the original code.
        // NOTE(review): presumably a zero entry marks a vertex without its own
        // neighbourhood — confirm against how `vertices` is built.
        // The scan is bounded by num_vertices so it can never read past the
        // end of `vertices` when all following entries are zero.
        for(unsigned long int next = tid + 1; next < num_vertices; next++){
            int next_offset = vertices[next];
            if(next_offset != 0){
                pos_last_neighbor = next_offset - 1;
                break;
            }//if
        }//for

        // Accumulate locally and touch global memory once.
        unsigned int sum = 0;
        for(int nid = pos_first_neighbor; nid <= pos_last_neighbor; nid++){
            if(adjacency[nid]!=tid){
                sum += degree[adjacency[nid]];
            }//if
        }//for
        degree_summation[tid] += sum;
    }//if
}//kernel
我不知道这是否会更快,因为我不知道参考实现的运行速度。但这是一种可能的方法:

  • 创建一个向量,定义每个顶点的邻接中的邻域

    vertices =     {0,  2,  4}
    adjacency =    {1,2,0,2,0,1}
    neighborhood = {1,0,1,0,1,0}  (after scatter)
    neighborhood = {1,1,2,2,3,3}  (after inclusive scan)
    
    为了实现这一点,我们使用 thrust::scatter,根据 vertices 提供的索引,将一组 1(由 thrust::constant_iterator 生成)写入 neighborhood 数组,然后对 neighborhood 执行一次 thrust::inclusive_scan。

  • 然后,我们可以使用 thrust::reduce_by_key,以 neighborhood 数组作为键,并使用 thrust::permutation_iterator 从 degree 中选取 adjacency 的每个邻域里列出的每个顶点所对应的值。

  • 然后我们只需要将 degree 的值加到步骤 2 中生成的中间 degree_summation 结果上。我们可以使用 thrust::transform 来完成这一步。

  • 下面是一个完整的可运行示例:

    $ cat t612.cu
    #include <thrust/device_vector.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/scatter.h>
    #include <thrust/scan.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>
    #include <thrust/copy.h>
    #include <iostream>
    
    
    // Demonstrates computing degree_summation with Thrust primitives only:
    // scatter + inclusive_scan build a per-edge "neighborhood id" key array,
    // reduce_by_key sums the neighbours' degrees per key, and transform adds
    // each vertex's own degree. Prints the result as a comma-separated list.
    int main(){

      // input data setup

      int h_vertices[] = {0,2,4};
      int h_adjacency[] = {1,2,0,2,0,1};
      int h_degree[] = {2,2,2};
      int vertices_size = sizeof(h_vertices)/sizeof(int);
      int adjacency_size = sizeof(h_adjacency)/sizeof(int);
      int degree_size = sizeof(h_degree)/sizeof(int);
      thrust::device_vector<int> vertices(h_vertices, h_vertices+vertices_size);
      thrust::device_vector<int> adjacency(h_adjacency, h_adjacency+adjacency_size);
      thrust::device_vector<int> degree(h_degree, h_degree+degree_size);
      thrust::device_vector<int> degree_summation(degree_size);

      // create neighborhood array

      thrust::device_vector<int> neighborhood(adjacency_size);
      thrust::constant_iterator<int> first(1);
      // Write a 1 at the start offset of every vertex: {1,0,1,0,1,0}.
      // The number of scattered elements must equal the length of the index
      // map (vertices), not the length of adjacency; otherwise scatter reads
      // past the end of vertices.
      thrust::scatter(first, first+vertices_size, vertices.begin(), neighborhood.begin());
      // The running sum turns the markers into one key per neighborhood: {1,1,2,2,3,3}.
      thrust::inclusive_scan(neighborhood.begin(), neighborhood.end(), neighborhood.begin());

      // sum degree over the neighborhoods

      // Values are degree[adjacency[i]]; the reduced keys are not needed and are discarded.
      // NOTE(review): output slot k belongs to the k-th distinct key, so this
      // lines up with vertex ids only if every vertex has at least one neighbor — confirm for real data.
      thrust::reduce_by_key(neighborhood.begin(), neighborhood.end(), thrust::make_permutation_iterator(degree.begin(), adjacency.begin()), thrust::make_discard_iterator(), degree_summation.begin());

      // add the vertex degrees

      thrust::transform(degree.begin(), degree.end(), degree_summation.begin(), degree_summation.begin(), thrust::plus<int>());

      // display results

      thrust::copy(degree_summation.begin(), degree_summation.end(), std::ostream_iterator<int>(std::cout, ","));
      std::cout << std::endl;
      return 0;
    }
    $ nvcc -arch=sm_35 -o t612 t612.cu
    $ ./t612
    6,6,6,
    $
    
    $cat t612.cu
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    int main(){
    //输入数据设置
    int h_顶点[]={0,2,4};
    int h_邻接[]={1,2,0,2,0,1};
    int h_度[]={2,2,2};
    int顶点大小=sizeof(h_顶点)/sizeof(int);
    int邻接_size=sizeof(h_邻接)/sizeof(int);
    int degree_size=sizeof(h_degree)/sizeof(int);
    推力::设备_向量顶点(h_顶点,h_顶点+顶点_大小);
    推力:设备向量邻接(h_邻接,h_邻接+邻接大小);
    推力:装置的矢量度(h度,h度+度大小);
    推力:装置矢量度数之和(度数大小);
    //创建邻域数组
    推力:设备向量邻域(邻接大小);
    推力:常量迭代器优先(1);
    推力::散布(第一,第一+邻接大小,顶点.begin(),邻域.begin());
    inclusive_扫描(neighbourt.begin(),neighbourt.end(),neighbourt.begin());
    //邻域上的和度
    推力::按键(neighbory.begin()、neighbory.end()、推力::生成置换迭代器(degree.begin()、adjacenty.begin()),推力::生成丢弃迭代器()、度求和迭代器(degree.begin());
    //添加顶点角度
    推力::变换(degree.begin()、degree.end()、degree_summation.begin()、degree_summation.begin()、推力::plus());
    //显示结果
    推力::复制(degree_summation.begin()、degree_summation.end()、std::ostream_迭代器(std::cout,“,”);
    
    我不清楚为什么内核中有 while 循环。显然,您的 vertices 数组中可能有虚假的零?如果您有一个大数据集可以尝试,将内核与 Thrust 版本进行比较可能会很有趣。不过,您的内核看起来并不糟糕,我可以想象在很多情况下它会比 Thrust 更快。

    相对性能取决于数据。当邻域"长"时,内核方法的性能会较差,特别是当长邻域和短邻域混合存在时。对于短邻域,它可能比我的 Thrust 代码快。

    确实存在长邻域和短邻域的混合,所以我猜这降低了性能。我将创建另一个存储各邻域结尾位置的数组,看看它是否能有所帮助。

    如果可以按邻域大小对顶点排序,那么内核的性能可能会更好。

    我可以尝试使用 thrust::stable_sort_by_key(degree.begin(), degree.end(), vertices.begin())。

    您还需要在 adjacency 中移动相应的邻域。vertices 仍然需要按升序排列,因此我认为这实际上相当于对顶点进行"重新编号/重新排序"。我还没有完全想清楚。我试图创建一个测试数据集,以便将您的内核与 Thrust 代码进行比较,但创建一个合成数据集似乎很困难。如果您能提出一个生成数据集的算法,我会再做一些尝试。
    $ cat t612.cu
    #include <thrust/device_vector.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/scatter.h>
    #include <thrust/scan.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>
    #include <thrust/copy.h>
    #include <iostream>
    
    
    // Demonstrates computing degree_summation with Thrust primitives only:
    // scatter + inclusive_scan build a per-edge "neighborhood id" key array,
    // reduce_by_key sums the neighbours' degrees per key, and transform adds
    // each vertex's own degree. Prints the result as a comma-separated list.
    int main(){

      // input data setup

      int h_vertices[] = {0,2,4};
      int h_adjacency[] = {1,2,0,2,0,1};
      int h_degree[] = {2,2,2};
      int vertices_size = sizeof(h_vertices)/sizeof(int);
      int adjacency_size = sizeof(h_adjacency)/sizeof(int);
      int degree_size = sizeof(h_degree)/sizeof(int);
      thrust::device_vector<int> vertices(h_vertices, h_vertices+vertices_size);
      thrust::device_vector<int> adjacency(h_adjacency, h_adjacency+adjacency_size);
      thrust::device_vector<int> degree(h_degree, h_degree+degree_size);
      thrust::device_vector<int> degree_summation(degree_size);

      // create neighborhood array

      thrust::device_vector<int> neighborhood(adjacency_size);
      thrust::constant_iterator<int> first(1);
      // Write a 1 at the start offset of every vertex: {1,0,1,0,1,0}.
      // The number of scattered elements must equal the length of the index
      // map (vertices), not the length of adjacency; otherwise scatter reads
      // past the end of vertices.
      thrust::scatter(first, first+vertices_size, vertices.begin(), neighborhood.begin());
      // The running sum turns the markers into one key per neighborhood: {1,1,2,2,3,3}.
      thrust::inclusive_scan(neighborhood.begin(), neighborhood.end(), neighborhood.begin());

      // sum degree over the neighborhoods

      // Values are degree[adjacency[i]]; the reduced keys are not needed and are discarded.
      // NOTE(review): output slot k belongs to the k-th distinct key, so this
      // lines up with vertex ids only if every vertex has at least one neighbor — confirm for real data.
      thrust::reduce_by_key(neighborhood.begin(), neighborhood.end(), thrust::make_permutation_iterator(degree.begin(), adjacency.begin()), thrust::make_discard_iterator(), degree_summation.begin());

      // add the vertex degrees

      thrust::transform(degree.begin(), degree.end(), degree_summation.begin(), degree_summation.begin(), thrust::plus<int>());

      // display results

      thrust::copy(degree_summation.begin(), degree_summation.end(), std::ostream_iterator<int>(std::cout, ","));
      std::cout << std::endl;
      return 0;
    }
    $ nvcc -arch=sm_35 -o t612 t612.cu
    $ ./t612
    6,6,6,
    $