Graph 迪克斯特拉'；CUDA中的s算法_Graph_Parallel Processing_Cuda_Dijkstra

Graph 迪克斯特拉'；CUDA中的s算法

graph parallel-processing cuda

Graph 迪克斯特拉'；CUDA中的s算法,graph,parallel-processing,cuda,dijkstra,Graph,Parallel Processing,Cuda,Dijkstra,我写的这段CUDA代码有问题。这应该是CUDA的实现。代码如下： __global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){ int tid = blockIdx.x; if(Ma[tid]=='1'){ Ma[tid] = '0';

我写的这段CUDA代码有问题。这应该是CUDA的实现。代码如下：

    __global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){

        int tid = blockIdx.x;
        if(Ma[tid]=='1'){
            Ma[tid] = '0';
            int ind_Ea = Sa[tid * 2];
            int num_edges = Sa[(tid * 2) + 1];
            int v;
            float wt = 0;
            unsigned int leaveloop;
            leaveloop = 0u;
            while(leaveloop==0u){
                if(atomicExch(lock, 1u) == 0u){
                    for(v = 0; v < num_edges; v++){
                        wt = (Va[tid * 3] - Va[Ea[ind_Ea + v] * 3]) * (Va[tid * 3] - Va[Ea[ind_Ea + v] * 3]) +
                                (Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) * (Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) + 
                                (Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]) * (Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]) ;
                        wt = sqrt(wt);

                        if(Ca[Ea[ind_Ea + v]] > (Ca[tid] + wt)){
                            Ca[Ea[ind_Ea + v]] = Ca[tid]  + wt;
                            Ma[Ea[ind_Ea + v]] = '1';
                        }
                        __threadfence();
                        leaveloop = 1u;
                        atomicExch(lock, 0u);
                    }
                }
            }
        }
    }

\uuuuu全局\uuuuuvoid cuda\u dijkstra\u内核\u1（浮点*Va、整数*Ea、整数*Sa、浮点*Ca、浮点*Ua、字符*Ma、无符号整数*lock）{
int tid=blockIdx.x；
如果（Ma[tid]='1'）{
Ma[tid]=“0”；
int ind_Ea=Sa[tid*2]；
int num_edges=Sa[（tid*2）+1]；
INTV；
浮点数wt=0；
无符号整数循环；
LEVELOOP=0u；
while（leveloop==0u）{
如果（原子EXCH（锁，1u）==0u）{
对于（v=0；v（Ca[tid]+wt））{
Ca[Ea[ind_Ea+v]=Ca[tid]+wt；
Ma[Ea[ind_Ea+v]='1'；
}
__螺纹围栏（）；
LEVELOOP=1u；
原子EXCH（锁，0u）；
}
}
}
}
}

问题在于Dijkstra算法的松弛阶段。我已经实施了这样一个阶段作为一个关键部分。如果有一个顶点（比如

）是多个顶点的邻居（即，连接到具有边的其他顶点），则这些顶点的所有线程将尝试写入成本数组

Ca

中顶点

的位置。现在我的目标是在该位置写入较小的值。为此，我尝试序列化进程并应用

\uuu threadfence（）

，以便一个线程写入的值对其他线程可见，然后最终较小的值保留在vertex

的位置。但问题是，这种逻辑是行不通的。vertex

的位置没有得到尝试写入该位置的所有线程的最小值，我不明白为什么。任何帮助都将不胜感激。

本文中包含了Dijkstra的单源最短路径（SSSP）算法在GPU上的一个“经典”（至少主要是参考）实现

Parwan Harish和p.J.Narayanan使用CUDA在GPU上加速大图算法

但是，该文件中的实现被认为存在缺陷，请参阅

Pedro J.Martín、Roberto Torres和Antonio Gavilanes为SSSP问题提供的CUDA解决方案

我在下面报告第一篇文章中建议的实现，根据第二篇文章的评论进行修正。该代码还包含一个C++版本。

#include <sstream>
#include <vector>
#include <iostream>
#include <stdio.h>
#include <float.h>

#include "Utilities.cuh"

#define NUM_ASYNCHRONOUS_ITERATIONS 20  // Number of async loop iterations before attempting to read results back

#define BLOCK_SIZE 16

/***********************/
/* GRAPHDATA STRUCTURE */
/***********************/
// --- The graph data structure is an adjacency list.
typedef struct {

    // --- Contains the integer offset to point to the edge list for each vertex
    int *vertexArray;

    // --- Overall number of vertices
    int numVertices;

    // --- Contains the "destination" vertices each edge is attached to
    int *edgeArray;

    // --- Overall number of edges
    int numEdges;

    // --- Contains the weight of each edge
    float *weightArray;

} GraphData;

/**********************************/
/* GENERATE RANDOM GRAPH FUNCTION */
/**********************************/
void generateRandomGraph(GraphData *graph, int numVertices, int neighborsPerVertex) {

    graph -> numVertices    = numVertices;
    graph -> vertexArray    = (int *)malloc(graph -> numVertices * sizeof(int));
    graph -> numEdges       = numVertices * neighborsPerVertex;
    graph -> edgeArray      = (int *)malloc(graph -> numEdges    * sizeof(int));
    graph -> weightArray    = (float *)malloc(graph -> numEdges  * sizeof(float));

    for (int i = 0; i < graph -> numVertices; i++) graph -> vertexArray[i] = i * neighborsPerVertex;

    int *tempArray = (int *)malloc(neighborsPerVertex * sizeof(int));
    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < neighborsPerVertex; l++) tempArray[l] = INT_MAX;
        for (int l = 0; l < neighborsPerVertex; l++) {
            bool goOn = false;
            int temp;
            while (goOn == false) {
                goOn = true;
                temp = (rand() % graph->numVertices);
                for (int t = 0; t < neighborsPerVertex; t++)
                    if (temp == tempArray[t]) goOn = false;
                if (temp == k) goOn = false;
                if (goOn == true) tempArray[l] = temp;
            }
            graph -> edgeArray  [k * neighborsPerVertex + l] = temp;
            graph -> weightArray[k * neighborsPerVertex + l] = (float)(rand() % 1000) / 1000.0f;
        }
    }
}

/************************/
/* minDistance FUNCTION */
/************************/
// --- Finds the vertex with minimum distance value, from the set of vertices not yet included in shortest path tree
int minDistance(float *shortestDistances, bool *finalizedVertices, const int sourceVertex, const int N) {

    // --- Initialize minimum value
    int minIndex = sourceVertex;
    float min = FLT_MAX;

    for (int v = 0; v < N; v++)
        if (finalizedVertices[v] == false && shortestDistances[v] <= min) min = shortestDistances[v], minIndex = v;

    return minIndex;
}

/************************/
/* dijkstraCPU FUNCTION */
/************************/
void dijkstraCPU(float *graph, float *h_shortestDistances, int sourceVertex, const int N) {

    // --- h_finalizedVertices[i] is true if vertex i is included in the shortest path tree
    //     or the shortest distance from the source node to i is finalized
    bool *h_finalizedVertices = (bool *)malloc(N * sizeof(bool));

    // --- Initialize h_shortestDistancesances as infinite and h_shortestDistances as false
    for (int i = 0; i < N; i++) h_shortestDistances[i] = FLT_MAX, h_finalizedVertices[i] = false;

    // --- h_shortestDistancesance of the source vertex from itself is always 0
    h_shortestDistances[sourceVertex] = 0.f;

    // --- Dijkstra iterations
    for (int iterCount = 0; iterCount < N - 1; iterCount++) {

        // --- Selecting the minimum distance vertex from the set of vertices not yet
        //     processed. currentVertex is always equal to sourceVertex in the first iteration.
        int currentVertex = minDistance(h_shortestDistances, h_finalizedVertices, sourceVertex, N);

        // --- Mark the current vertex as processed
        h_finalizedVertices[currentVertex] = true;

        // --- Relaxation loop
        for (int v = 0; v < N; v++) {

            // --- Update dist[v] only if it is not in h_finalizedVertices, there is an edge
            //     from u to v, and the cost of the path from the source vertex to v through
            //     currentVertex is smaller than the current value of h_shortestDistances[v]
            if (!h_finalizedVertices[v] &&
                graph[currentVertex * N + v] &&
                h_shortestDistances[currentVertex] != FLT_MAX &&
                h_shortestDistances[currentVertex] + graph[currentVertex * N + v] < h_shortestDistances[v])

                h_shortestDistances[v] = h_shortestDistances[currentVertex] + graph[currentVertex * N + v];
        }
    }
}

/***************************/
/* MASKARRAYEMPTY FUNCTION */
/***************************/
// --- Check whether all the vertices have been finalized. This tells the algorithm whether it needs to continue running or not.
bool allFinalizedVertices(bool *finalizedVertices, int numVertices) {

    for (int i = 0; i < numVertices; i++)  if (finalizedVertices[i] == true) { return false; }

    return true;
}

/*************************/
/* ARRAY INITIALIZATIONS */
/*************************/
__global__ void initializeArrays(bool * __restrict__ d_finalizedVertices, float* __restrict__ d_shortestDistances, float* __restrict__ d_updatingShortestDistances,
                                 const int sourceVertex, const int numVertices) {

    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < numVertices) {

        if (sourceVertex == tid) {

            d_finalizedVertices[tid]            = true;
            d_shortestDistances[tid]            = 0.f;
            d_updatingShortestDistances[tid]    = 0.f; }

        else {

            d_finalizedVertices[tid]            = false;
            d_shortestDistances[tid]            = FLT_MAX;
            d_updatingShortestDistances[tid]    = FLT_MAX;
        }
    }
}

/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
__global__  void Kernel1(const int * __restrict__ vertexArray, const int* __restrict__ edgeArray,
                         const float * __restrict__ weightArray, bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances,
                         float * __restrict__ updatingShortestDistances, const int numVertices, const int numEdges) {

    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    if (tid < numVertices) {

        if (finalizedVertices[tid] == true) {

            finalizedVertices[tid] = false;

            int edgeStart = vertexArray[tid], edgeEnd;

            if (tid + 1 < (numVertices)) edgeEnd = vertexArray[tid + 1];
            else                         edgeEnd = numEdges;

            for (int edge = edgeStart; edge < edgeEnd; edge++) {
                int nid = edgeArray[edge];
                atomicMin(&updatingShortestDistances[nid], shortestDistances[tid] + weightArray[edge]);
            }
        }
    }
}

/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
__global__  void Kernel2(const int * __restrict__ vertexArray, const int * __restrict__ edgeArray, const float* __restrict__ weightArray,
                         bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, float* __restrict__ updatingShortestDistances,
                         const int numVertices) {

    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < numVertices) {

        if (shortestDistances[tid] > updatingShortestDistances[tid]) {
            shortestDistances[tid] = updatingShortestDistances[tid];
            finalizedVertices[tid] = true; }

        updatingShortestDistances[tid] = shortestDistances[tid];
    }
}

/************************/
/* dijkstraGPU FUNCTION */
/************************/
void dijkstraGPU(GraphData *graph, const int sourceVertex, float * __restrict__ h_shortestDistances) {

    // --- Create device-side adjacency-list, namely, vertex array Va, edge array Ea and weight array Wa from G(V,E,W)
    int     *d_vertexArray;         gpuErrchk(cudaMalloc(&d_vertexArray,    sizeof(int)   * graph -> numVertices));
    int     *d_edgeArray;           gpuErrchk(cudaMalloc(&d_edgeArray,  sizeof(int)   * graph -> numEdges));
    float   *d_weightArray;         gpuErrchk(cudaMalloc(&d_weightArray,    sizeof(float) * graph -> numEdges));

    // --- Copy adjacency-list to the device
    gpuErrchk(cudaMemcpy(d_vertexArray, graph -> vertexArray, sizeof(int)   * graph -> numVertices, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_edgeArray,   graph -> edgeArray,   sizeof(int)   * graph -> numEdges,    cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_weightArray, graph -> weightArray, sizeof(float) * graph -> numEdges,    cudaMemcpyHostToDevice));

    // --- Create mask array Ma, cost array Ca and updating cost array Ua of size V
    bool    *d_finalizedVertices;           gpuErrchk(cudaMalloc(&d_finalizedVertices,       sizeof(bool)   * graph->numVertices));
    float   *d_shortestDistances;           gpuErrchk(cudaMalloc(&d_shortestDistances,       sizeof(float) * graph->numVertices));
    float   *d_updatingShortestDistances;   gpuErrchk(cudaMalloc(&d_updatingShortestDistances, sizeof(float) * graph->numVertices));

    bool *h_finalizedVertices = (bool *)malloc(sizeof(bool) * graph->numVertices);

    // --- Initialize mask Ma to false, cost array Ca and Updating cost array Ua to \u221e
    initializeArrays <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_finalizedVertices, d_shortestDistances,
                                                            d_updatingShortestDistances, sourceVertex, graph -> numVertices);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Read mask array from device -> host
    gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));

    while (!allFinalizedVertices(h_finalizedVertices, graph->numVertices)) {

        // --- In order to improve performance, we run some number of iterations without reading the results.  This might result
        //     in running more iterations than necessary at times, but it will in most cases be faster because we are doing less
        //     stalling of the GPU waiting for results.
        for (int asyncIter = 0; asyncIter < NUM_ASYNCHRONOUS_ITERATIONS; asyncIter++) {

            Kernel1 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances,
                                                            d_updatingShortestDistances, graph->numVertices, graph->numEdges);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
            Kernel2 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances, d_updatingShortestDistances,
                                                            graph->numVertices);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));
    }

    // --- Copy the result to host
    gpuErrchk(cudaMemcpy(h_shortestDistances, d_shortestDistances, sizeof(float) * graph->numVertices, cudaMemcpyDeviceToHost));

    free(h_finalizedVertices);

    gpuErrchk(cudaFree(d_vertexArray));
    gpuErrchk(cudaFree(d_edgeArray));
    gpuErrchk(cudaFree(d_weightArray));
    gpuErrchk(cudaFree(d_finalizedVertices));
    gpuErrchk(cudaFree(d_shortestDistances));
    gpuErrchk(cudaFree(d_updatingShortestDistances));
}

/****************/
/* MAIN PROGRAM */
/****************/
int main() {

    // --- Number of graph vertices
    int numVertices = 8;

    // --- Number of edges per graph vertex
    int neighborsPerVertex = 6;

    // --- Source vertex
    int sourceVertex = 0;

    // --- Allocate memory for arrays
    GraphData graph;
    generateRandomGraph(&graph, numVertices, neighborsPerVertex);

    // --- From adjacency list to adjacency matrix.
    //     Initializing the adjacency matrix
    float *weightMatrix = (float *)malloc(numVertices * numVertices * sizeof(float));
    for (int k = 0; k < numVertices * numVertices; k++) weightMatrix[k] = FLT_MAX;

    // --- Displaying the adjacency list and constructing the adjacency matrix
    printf("Adjacency list\n");
    for (int k = 0; k < numVertices; k++) weightMatrix[k * numVertices + k] = 0.f;
    for (int k = 0; k < numVertices; k++)
        for (int l = 0; l < neighborsPerVertex; l++) {
            weightMatrix[k * numVertices + graph.edgeArray[graph.vertexArray[k] + l]] = graph.weightArray[graph.vertexArray[k] + l];
            printf("Vertex nr. %i; Edge nr. %i; Weight = %f\n", k, graph.edgeArray[graph.vertexArray[k] + l],
                                                                   graph.weightArray[graph.vertexArray[k] + l]);
        }

    for (int k = 0; k < numVertices * neighborsPerVertex; k++)
        printf("%i %i %f\n", k, graph.edgeArray[k], graph.weightArray[k]);

    // --- Displaying the adjacency matrix
    printf("\nAdjacency matrix\n");
    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < numVertices; l++)
            if (weightMatrix[k * numVertices + l] < FLT_MAX)
                printf("%1.3f\t", weightMatrix[k * numVertices + l]);
            else
                printf("--\t");
            printf("\n");
        }

    // --- Running Dijkstra on the CPU
    float *h_shortestDistancesCPU = (float *)malloc(numVertices * sizeof(float));
    dijkstraCPU(weightMatrix, h_shortestDistancesCPU, sourceVertex, numVertices);

    printf("\nCPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesCPU[k]);

    // --- Allocate space for the h_shortestDistancesGPU
    float *h_shortestDistancesGPU = (float*)malloc(sizeof(float) * graph.numVertices);
    dijkstraGPU(&graph, sourceVertex, h_shortestDistancesGPU);

    printf("\nGPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesGPU[k]);

    free(h_shortestDistancesCPU);
    free(h_shortestDistancesGPU);

    return 0;
}

#包括
#包括
#包括
#包括
#包括
#包括“Utilities.cuh”
#定义NUM_ASYNCHRONOUS_迭代次数20//尝试回读结果之前的异步循环迭代次数
#定义块大小16
/***********************/
/*图形数据结构*/
/***********************/
//---图形数据结构是一个邻接列表。
类型定义结构{
//---包含指向每个顶点的边列表的整数偏移
int*顶点阵列；
//---顶点总数
智力；
//---包含每条边附加到的“目标”顶点
国际*埃杰雷；
//---边缘的总数量
国际努梅德斯；
//---包含每条边的权重
浮点*加权数组；
}石墨数据；
/**********************************/
/*生成随机图函数*/
/**********************************/
void generateRandomGraph（GraphData*graph，int num属性，int neighborsPerVertex）{
图表->numVertices=numVertices；
graph->vertexArray=（int*）malloc（graph->numVertices*sizeof（int））；
graph->numEdges=numVertices*neighborsPerVertex；
graph->Edgerray=（int*）malloc（graph->numEdges*sizeof（int））；
graph->weightArray=（float*）malloc（graph->numEdges*sizeof（float））；
对于（inti=0；inumVertices；i++）graph->vertexArray[i]=i*neighterspervertex；
int*tempArray=（int*）malloc（nexterspervertex*sizeof（int））；
对于（int k=0；knum属性）；
for（int t=0；t边环[k*邻域Pervertex+l]=温度；
图形->权重数组[k*邻域Pervertex+l]=（浮点）（rand（）%1000）/1000.0f；
}
}
}
/************************/
/*思维距离函数*/
/************************/
//---从尚未包含在最短路径树中的顶点集中查找具有最小距离值的顶点
int minDistance（浮点*最短距离，布尔*最终顶点，常量int sourceVertex，常量int N）{
//---初始化最小值
int minIndex=源顶点；
最小浮动=最大浮动；
对于（int v=0；v