Matlab 如何在推力或立方阵中找到两个矩阵W和X之间的行到所有行距离矩阵？_Matlab_Cuda_Thrust_Cublas

Matlab 如何在推力或立方阵中找到两个矩阵W和X之间的行到所有行距离矩阵？

matlab cuda

Matlab 如何在推力或立方阵中找到两个矩阵W和X之间的行到所有行距离矩阵？,matlab,cuda,thrust,cublas,Matlab,Cuda,Thrust,Cublas,我有以下matlab代码 tempx = full(sum(X.^2, 2)); tempc = full(sum(C.^2, 2).'); D = -2*(X * C.'); D = bsxfun(@plus, D, tempx); D = bsxfun(@plus, D, tempc); 其中X是nxm，W是kxm矩阵。一个是数据，另一个是权重矩阵。我用给定的代码求距离矩阵D。我正在观察这一行动的有效实施。我成功地完成了这一行D=-2*（X*C.）由库布拉斯，但剩余部分仍然是一个问题作为一

我有以下matlab代码

tempx = full(sum(X.^2, 2));
tempc = full(sum(C.^2, 2).');
D = -2*(X * C.');
D = bsxfun(@plus, D, tempx);
D = bsxfun(@plus, D, tempc);

其中X是nxm，W是kxm矩阵。一个是数据，另一个是权重矩阵。我用给定的代码求距离矩阵D。我正在观察这一行动的有效实施。我成功地完成了这一行

D=-2*（X*C.）由库布拉斯，但剩余部分仍然是一个问题作为一个新手？有人能帮我写一个片段或提出建议吗
以下是我到目前为止的情况：
编辑：我添加了更多的代码，我需要类似bsxfun的求和实现。最后一步，求向量V和所有列的和，求V2和所有行的和
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cuda.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/system_error.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>

#define N 4
#define K 4
#define M 3

template <typename T>
struct square_op{
    __host__ __device__ T operator()(const T& x)const{
        return x*x;
    }
};


int main (void){
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;

    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }
    // Fill with random data
    thrust::host_vector<float> C_h(N*K);

    thrust::host_vector<float> A_h(N*M);        //data matrix
    thrust::host_vector<float> B_h(K*M);        //weight matrix
    thrust::sequence(A_h.begin(),A_h.end());
    thrust::sequence(B_h.begin(),B_h.end());
    //  std::generate(A_h.begin(), A_h.end(), rand);
    //  std::generate(B_h.begin(), B_h.end(), rand);

    thrust::device_vector<float> A_d = A_h;
    thrust::device_vector<float> B_d = B_h;
    thrust::device_vector<float> C_d(N*K);
    thrust::device_vector<float> dummy_x(M,1);
    thrust::device_vector<float> A_sum_vec_d(N,0);
    thrust::device_vector<float> B_sum_vec_d(K,0);

    // TEST variables
    thrust::host_vector<float> A_sum_vec_h(N,0);
    thrust::host_vector<float> B_sum_vec_h(K,0);

    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < M; ++j) {
            printf("%f ",A_h[i*M+j]);
        }
        printf("\n");
    }

    printf("\n");

    for (int i = 0; i < K; ++i) {
        for (int j = 0; j < M; ++j) {
            printf("%f ",B_h[i*M+j]);
        }
        printf("\n");
    }

    printf("\n");

    std::cout<< "Starting GPU run" <<std::endl;  //add this line
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    //************************************
    // Calculate Square Elements
    //************************************
    square_op<float> unary_op = square_op<float>();
    thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op);
    thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op);

    // TEST
    thrust::copy(A_d.begin(),A_d.end(), A_h.begin());
    printf("Matrix A after square!!\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < M; ++j) {
            printf("%f ",A_h[i*M+j]);
        }
        printf("\n");
    }

    printf("\n");

    thrust::copy(B_d.begin(),B_d.end(), B_h.begin());
    printf("Matrix B after square!!\n");
    for (int i = 0; i < K; ++i) {
        for (int j = 0; j < M; ++j) {
            printf("%f ",B_h[i*M+j]);
        }
        printf("\n");
    }

    printf("\n");

    //************************************
    // Sum of the Rows
    //************************************
    float alpha = 1.0f;
    float beta = 0.0f;

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("1 CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("2 CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    // TEST
    thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin());

    printf("A_vec after row sum!!\n");
    for (int j = 0; j < N; ++j) {
        printf("%f ",A_sum_vec_h[j]);
    }
    printf("\n \n");

    thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin());

    printf("B_vec after row sum!!\n");
    for (int j = 0; j < K; ++j) {
        printf("%f ",B_sum_vec_h[j]);
    }
    printf("\n \n");



    //************************************
    // Matrix Multiplication
    //************************************
    alpha = 2.0f;
    beta = 0.0f;
    //alpha*(A*B')+beta in row_major_order
    stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    float totalTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    totalTime = elapsedTime/1000;
    std::cout<<"Elapsed_time:"<< totalTime<<std::endl;

    //copy back data
    thrust::copy(C_d.begin(),C_d.end(),C_h.begin());

    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < K; ++j) {
            printf("%f ",C_h[i*K+j]);
        }
        printf("\n");
    }

    //************************************
    // Final summation
    //************************************
    //.... NEED CODE

    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    printf("Execution ends!!\n");
    return EXIT_SUCCESS;
}

#包括
#包括
#包括
#包括
#包括“cublas_v2.h”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义n4
#定义k4
#定义m3
样板
建筑广场{
__主机\uuuuuu\uuuu设备\uuuuu操作符（）（常量T&x）常量{
返回x*x；
}
};
内部主（空）{
cudaError_t cudaStat；
库布拉斯塔图斯统计局；
立方手柄；
stat=cublasCreate（&handle）；
if（stat！=CUBLAS\u STATUS\u SUCCESS）{
printf（“CUBLAS初始化失败！！\n”）；
返回退出失败；
}
//用随机数据填充
推力：主向量C_h（N*K）；
推力：：主机向量A_h（N*M）；//数据矩阵
推力：：主向量B_h（K*M）；//权重矩阵
顺序（A_h.begin（），A_h.end（））；
推力：序列（B_h.begin（），B_h.end（））；
//std:：generate（A_h.begin（），A_h.end（），rand）；
//std:：generate（B_h.begin（），B_h.end（），rand）；
推力：装置矢量A\u d=A\u h；
推力：设备_向量B_d=B_h；
推力：装置矢量C_d（N*K）；
推力：装置矢量虚拟（M，1）；
推力：设备向量A和向量d（N，0）；
推力：设备向量B和向量d（K，0）；
//测试变量
推力：主向量A和向量h（N，0）；
推力：主向量B和向量h（K，0）；
对于（int i=0；istd：：cout这是我的最终代码，它与我的描述一样工作。它可能不是最有效的代码，但至少可以工作
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cuda.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/system_error.h>
#include <thrust/sequence.h>
#include <thrust/device_free.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>

#define N 4
#define K 4
#define M 3


//----------------------------
// Find min argument of array
// ---------------------------

// C-style indexing
int ci(int row, int column, int nColumns) {
    return row*nColumns+column;
}

// Convert a linear index to a row index
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns

    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    __host__ __device__
    T operator()(T i)
    {
        return i / C;
    }
};

typedef thrust::tuple<int,float> argMinType;

struct argMin : public thrust::binary_function<argMinType,argMinType,argMinType>
{
    __host__ __device__
    argMinType operator()(const argMinType& a, const argMinType& b) const
    {
        if (thrust::get<1>(a) < thrust::get<1>(b)){
            return a;
        } else {
            return b;
        }
    }
};

thrust::device_vector<argMinType> minsOfRowSpace(thrust::device_vector<float> A, int nRows, int nColumns) {
    // allocate storage for row argmins and indices
    thrust::device_vector<argMinType> row_argmins(nRows);
    thrust::device_vector<int> row_indices(nRows);

    // compute row argmins by finding argmin values with equal row indices
    thrust::reduce_by_key
    (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)),
            thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)) + (nRows*nColumns),
            thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0),A.begin())),
            row_indices.begin(),
            row_argmins.begin(),
            thrust::equal_to<int>(),
            argMin());
    return row_argmins;
}


template <typename T>
struct square_op{
    __host__ __device__ T operator()(const T& x)const{
        return x*x;
    }
};



int main (void){
    cublasStatus_t stat;
    cublasHandle_t handle;

    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }


    // Fill with random data
    thrust::host_vector<float> C_h(N*K);
    thrust::host_vector<argMinType> minOfC_h(N);
    thrust::host_vector<float> A_h(N*M);        //data matrix
    thrust::host_vector<float> B_h(K*M);        //weight matrix
    thrust::sequence(A_h.begin(),A_h.end());
    thrust::sequence(B_h.begin(),B_h.end());
    //  std::generate(A_h.begin(), A_h.end(), rand);
    //  std::generate(B_h.begin(), B_h.end(), rand);

    thrust::device_vector<float> A_d = A_h;
    thrust::device_vector<float> B_d = B_h;
    thrust::device_vector<float> C_d(N*K);
    thrust::device_vector<float> dummy_x(M,1);
    thrust::device_vector<float> A_sum_vec_d(N,0);
    thrust::device_vector<float> B_sum_vec_d(K,0);
    thrust::device_vector<argMinType> minsOfC_d(N);

    // TEST variables
    thrust::host_vector<float> A_sum_vec_h(N,0);
    thrust::host_vector<float> B_sum_vec_h(K,0);



    // TEST
    //  for (int i = 0; i < N; ++i) {
    //      for (int j = 0; j < M; ++j) {
    //          printf("%f ",A_h[i*M+j]);
    //      }
    //      printf("\n");
    //  }
    //
    //  printf("\n");
    //
    //  for (int i = 0; i < K; ++i) {
    //      for (int j = 0; j < M; ++j) {
    //          printf("%f ",B_h[i*M+j]);
    //      }
    //      printf("\n");
    //  }
    //
    //  printf("\n");

    std::cout<< "Starting GPU run" <<std::endl;  //add this line
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);


    //************************************
    // Matrix Multiplication
    //************************************
    float alpha = -2.0f;
    float beta = 0.0f;
    //alpha*(A*B')+beta in row_major_order
    stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    //TEST
    //  C_h = C_d;
    //  printf("After Matrix Multiplicaton\n");
    //  for (int i = 0; i < N; ++i) {
    //      for (int j = 0; j < K; ++j) {
    //          printf("%f ",C_h[i*K+j]);
    //      }
    //      printf("\n");
    //  }
    //  printf("\n");

    //************************************
    // Calculate Square Elements
    //************************************
    try{
        square_op<float> unary_op = square_op<float>();
        thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op);
        thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op);
    }catch(thrust::system_error &e)
    {
        // output an error message and exit
        std::cerr << "Error Transform square elements: " << e.what() << std::endl;
        exit(-1);
    }

    // TEST
    //  thrust::copy(A_d.begin(),A_d.end(), A_h.begin());
    //  printf("Matrix A after square!!\n");
    //  for (int i = 0; i < N; ++i) {
    //      for (int j = 0; j < M; ++j) {
    //          printf("%f ",A_h[i*M+j]);
    //      }
    //      printf("\n");
    //  }
    //
    //  printf("\n");
    //
    //  thrust::copy(B_d.begin(),B_d.end(), B_h.begin());
    //  printf("Matrix B after square!!\n");
    //  for (int i = 0; i < K; ++i) {
    //      for (int j = 0; j < M; ++j) {
    //          printf("%f ",B_h[i*M+j]);
    //      }
    //      printf("\n");
    //  }
    //
    //  printf("\n");

    //************************************
    // Sum of the Rows
    //************************************
    alpha = 1.0f;
    beta = 0.0f;

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("1 CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("2 CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    //thrust::device_free(thrust::raw_pointer_cast(&dummy_x[0]));

    // TEST
    //  thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin());
    //
    //  printf("A_vec after row sum!!\n");
    //  for (int j = 0; j < N; ++j) {
    //      printf("%f ",A_sum_vec_h[j]);
    //  }
    //  printf("\n \n");
    //
    //  thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin());
    //
    //  printf("B_vec after row sum!!\n");
    //  for (int j = 0; j < K; ++j) {
    //      printf("%f ",B_sum_vec_h[j]);
    //  }
    //  printf("\n \n");

    //************************************
    // Final summation
    //************************************
    thrust::device_vector<float> dummy_y(N,1);
    alpha = 1.0f;

    // Column-wise sum
    stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1,thrust::raw_pointer_cast(&C_d[0]),K);

    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS final summation failure!!\n");
        return EXIT_FAILURE;
    }

    // Row-wise sum
    stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&C_d[0]),K);

    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS final summation failure!!\n");
        return EXIT_FAILURE;
    }

    //thrust::device_free(&dummy_y[0]);

    // TEST
    //  printf("C matrix after first final sum\n");
    //  C_h = C_d;
    //  for (int i = 0; i < N; ++i) {
    //      for (int j = 0; j < K; ++j) {
    //          printf("%f ",C_h[i*K+j]);
    //      }
    //      printf("\n");
    //  }
    //  printf("\n");

    //*****************************
    // Find min elements
    // ****************************
    minsOfC_d=minsOfRowSpace(C_d, N, K);
    //thrust::copy(minsOfC_d.begin(),minsOfC_d.end(),minOfC_h.begin());
    minOfC_h = minsOfC_d;

    // TEST
    //  for (size_t i = 0; i < N; i++){
    //      for (size_t j = 0; j < 1; j++){
    //          argMinType tmp=minOfC_h[ci(i,j,1)];
    //          std::cout << thrust::get<0>(tmp) << " ";
    //      }
    //      std::cout << "\n";
    //  }
    //  std::cout << "\n";

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    float totalTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    totalTime = elapsedTime/1000;
    std::cout<<"Elapsed_time:"<< totalTime<<std::endl;


    printf("Execution ends!!\n");
    return EXIT_SUCCESS;
}

#包括
#包括
#包括
#包括
#包括“cublas_v2.h”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义n4
#定义k4
#定义m3
//----------------------------
//查找数组的最小参数
// ---------------------------
//C型索引
int ci（int行、int列、int列）{
返回行*n列+列；
}
//将线性索引转换为行索引
样板
结构线性索引到行索引：公共推力：：一元函数
{
tc；//列数
__主机设备__
线性索引到行索引（tc）：C（C）{
__主机设备__
T运算符（）（ti）
{
返回i/C；
}
};
typedef推力：：元组argMinType；
结构argMin:公共推力：：二进制函数
{
__主机设备__
argMinType运算符（）（常量argMinType&a、常量argMinType&b）常量
{
如果（推力：获取（a）<推力：获取（b））{
返回a；
}否则{
返回b；
}
}
};
推力：：设备向量minsOfRowSpace（推力：：设备向量A，int nRows，int nColumns）{
//为行argmin和索引分配存储
推力：装置矢量行（nRows）；
推力：装置矢量行指数（nRows）；
//通过查找具有相等行索引的argmin值来计算行argmin
推力：通过按键减少推力
（推力：：生成转换迭代器（推力：：计数迭代器（0），线性索引到行索引（nColumns）），
推力：：使_变换_迭代器（推力：：计数_迭代器（0），线性_索引_到_行_索引（nColumns））+（nRows*nColumns），
推力：：生成zip迭代器（推力：：生成元组（推力：：计数迭代器（0），A.begin（）），
行索引。开始（），
行_argmins.begin（），
推力：：等于（），
argMin（））；
返回行_argmins；
}
样板
建筑广场{
__主机\uuuuuu\uuuu设备\uuuuu操作符（）（常量T&x）常量{
返回x*x；
}
};
内部主（空）{
库布拉斯塔图斯统计局；
立方手柄；
stat=cublasCreate（&handle）；
if（stat！=CUBLAS\u STATUS\u SUCCESS）{
printf（“CUBLAS初始化失败！！\n”）；
返回退出失败；
}
//用随机数据填充
推力：主向量C_h（N*K）；
推力：主向量minOfC_h（N）；
推力：：主机向量A_h（N*M）；//数据矩阵
推力：：主向量B_h（K*M）；//权重矩阵
顺序（A_h.begin（），A_h.end（））；
推力：序列（B_h.begin（），B_h.end（））；
//std:：generate（A_h.begin（），A_h.end（），rand）；
//std:：generate（B_h.begin（），B_h.end（），rand）；
推力：装置矢量A\u d=A\u h；
推力：设备_向量B_d=B_h；
推力：装置矢量C_d（N*K）；
推力：装置矢量虚拟（M，1）；
推力：设备向量A和向量d（N，0）；
推力：设备向量B和向量d（K，0）；
推力：装置向量minsOfC（N）；
//测试变量
推力：主向量A和向量h（N，0）；
推力：主向量B和向量h（K，0）；
//试验
//对于（int i=0；i