CUDA Thrust copy：列主序的 OutputIterator_Cuda_Thrust

CUDA Thrust copy：列主序的 OutputIterator

cuda

Cuda 推力副本-输出器柱主订单,cuda,thrust,Cuda,Thrust,我有一个矩阵向量，存储为列主数组，我想垂直合并。因此，我想利用推力框架中的copy函数，如以下示例片段所示： int offset = 0; for(int i = 0; i < matrices.size(); ++i) { thrust::copy( thrust::device_ptr<float>(matrices[i]), thrust::device_ptr<float>(matrices[i]) + rows[i

我有一组矩阵（保存在一个 vector 中），每个矩阵都以列主序数组的形式存储，我想把它们垂直拼接起来。因此，我想利用 Thrust 框架中的 copy 函数，如以下示例片段所示：

// Naive concatenation: each matrix's buffer is copied, element order
// unchanged, directly behind the previous one in `result`.
// NOTE(review): matrices[i] and result are presumably raw float* device
// pointers (they are wrapped in thrust::device_ptr<float> here) -- their
// declarations are not part of this snippet, confirm against the caller.
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
    thrust::copy(
        thrust::device_ptr<float>(matrices[i]),
        thrust::device_ptr<float>(matrices[i]) + rows[i] * cols[i],
        thrust::device_ptr<float>(result) + offset
    );

    offset += rows[i] * cols[i];
}

编辑：扩展示例：

问题是，如果我有一个矩阵 A = [[1,2,3],[4,5,6]]（2 行 3 列，内存中为 [1,4,2,5,3,6]）和另一个矩阵 B = [[7,8,9]]（1 行 3 列，内存中为 [7,8,9]），那么得到的矩阵 C 并不是期望的 [[1,2,3],[4,5,6],[7,8,9]]（3 行 3 列，内存中为 [1,4,7,2,5,8,3,6,9]），而是 [[1,5,7],[4,3,8],[2,6,9]]（3 行 3 列，内存中为 [1,4,2,5,3,6,7,8,9]）。

有没有一种方法可以为这个问题创建一个特殊的输出迭代器（OutputIterator）？我已经搜索过了，但什么也没找到。或者，有没有一种快速的替代方法？

编辑：SSCCE

#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/device_vector.h>
#include <iostream>

// Prints a column-major matrix as a grid: one output line per matrix row,
// every element followed by a single space.
// Element (r, c) is read from linear index r + c * rows.
void printMat2d(thrust::device_vector<float>& mat, int rows, int cols) {
    for(int r = 0; r < rows; ++r) {
        for(int c = 0; c < cols; ++c) {
            // Consecutive columns of one row are `rows` elements apart.
            const float value = mat[c * rows + r];
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }
}

// Prints the first rows * cols elements of `mat` in storage order on a
// single line, every element followed by a single space.
void printMat1d(thrust::device_vector<float>& mat, int rows, int cols) {
    const int count = cols * rows;
    int idx = 0;
    while(idx < count) {
        const float value = mat[idx];
        std::cout << value << " ";
        ++idx;
    }
    std::cout << std::endl;
}

// Fills `mat` with a rows x cols test matrix stored in column-major order.
// Read row by row, the values are add, add + 1, add + 2, ...
// The matrix is built in a host-side staging buffer and then assigned to
// `mat` in one step.
void generateMat(thrust::device_vector<float>& mat, int rows, int cols, int add) {
    thrust::host_vector<float> staging(rows * cols);
    for(int r = 0; r < rows; ++r) {
        for(int c = 0; c < cols; ++c) {
            // r * cols + c is the running row-major counter of element (r, c).
            const int rowMajorRank = r * cols + c;
            staging[c * rows + r] = rowMajorRank + add;
        }
    }
    mat = staging;
}

// Question's reproduction case: builds a 2x3 and a 3x3 column-major matrix,
// then copies their buffers back to back into a 5x3 result. Because the
// buffers are simply appended, the printed result is NOT the vertical
// concatenation of the two inputs -- that mismatch is what the question
// is about.
int main() {
    const size_t matrixCount = 2;

    // Dimensions of the inputs: matrix 0 is 2x3, matrix 1 is 3x3.
    std::vector<int> rows(matrixCount);
    std::vector<int> cols(matrixCount);
    rows[0] = 2;
    cols[0] = 3;
    rows[1] = 3;
    cols[1] = 3;

    //generate matrices
    std::vector<thrust::device_vector<float> > matrices(matrixCount);
    for(size_t m = 0; m < matrices.size(); ++m) {
        generateMat(matrices[m], rows[m], cols[m], m*10);

        std::cout << "mat_ " << m << " = " << std::endl;
        printMat2d(matrices[m], rows[m], cols[m]);
        printMat1d(matrices[m], rows[m], cols[m]);
    }

    //copy
    const int resultRows = 5;
    const int resultCols = 3;
    thrust::device_vector<float> result(resultRows * resultCols);

    // Next free position in `result`; each matrix is appended unchanged.
    int writePos = 0;
    for(size_t m = 0; m < matrices.size(); ++m) {
        thrust::copy(matrices[m].begin(), matrices[m].end(), result.begin() + writePos);
        writePos += rows[m] * cols[m];
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

编辑：我之前的答案使用的是逐行的跨步范围（strided range）方法，现已替换为一种稍有不同的方法：对每个要复制的矩阵，只需一次 Thrust 调用即可完成复制。

这里的关键思想是使用一个函子，将行主序的内存索引转换为列主序的内存索引。然后，将该函子与计数迭代器（counting_iterator）一起使用，通过 make_transform_iterator 生成任意的行主序到列主序的索引映射。这些索引随后可以用在源矩阵的置换迭代器（permutation_iterator）中，以选择要复制的元素；也可以用在目标矩阵的置换迭代器中，以选择要写入的内存位置。有关变换迭代器、计数迭代器和置换迭代器的概述，请参阅 Thrust 文档（原文此处的链接已丢失）。这个练习中我使用的是 CUDA 5.0 和 Thrust 1.5.3。

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <iostream>

// Maps the linear row-major index of a matrix element to the linear
// column-major index of the same element, for a matrix with `r` rows and
// `c` columns.
//
// For idx = row * c + col the result is col * r + row.
//
// Precondition: c != 0 (the index is split with / and % by c) and idx >= 0.
struct rm2cm_idx_functor : public thrust::unary_function<int, int>
{
  int r;  // number of rows of the addressed column-major matrix
  int c;  // number of columns of the addressed column-major matrix

  // Callable from host and device code so the functor can be constructed
  // wherever it is needed.
  __host__ __device__
  rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {}

  // const: the mapping does not modify the functor, so it can be invoked
  // through const copies held by iterators.
  __host__ __device__
  int operator() (int idx) const {
    const int my_r = idx / c;  // row of the element
    const int my_c = idx % c;  // column of the element
    return (my_c * r) + my_r;
  }
};

typedef float my_type;


// Prints a column-major matrix as a grid: one output line per matrix row,
// every element followed by a single space.
// Element (r, c) is read from linear index r + c * rows.
void printMat2d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    for(int r = 0; r < rows; ++r) {
        for(int c = 0; c < cols; ++c) {
            // Consecutive columns of one row are `rows` elements apart.
            const my_type value = mat[c * rows + r];
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }
}

// Prints the first rows * cols elements of `mat` in storage order on a
// single line, every element followed by a single space.
void printMat1d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    const int count = cols * rows;
    int idx = 0;
    while(idx < count) {
        const my_type value = mat[idx];
        std::cout << value << " ";
        ++idx;
    }
    std::cout << std::endl;
}

// Fills `mat` with a rows x cols test matrix stored in column-major order.
// Read row by row, the values are add, add + 1, add + 2, ...
// The matrix is built in a host-side staging buffer and then assigned to
// `mat` in one step.
void generateMat(thrust::device_vector<my_type>& mat, int rows, int cols, int add) {
    thrust::host_vector<my_type> staging(rows * cols);
    for(int r = 0; r < rows; ++r) {
        for(int c = 0; c < cols; ++c) {
            // r * cols + c is the running row-major counter of element (r, c).
            const int rowMajorRank = r * cols + c;
            staging[c * rows + r] = rowMajorRank + add;
        }
    }
    mat = staging;
}


// Copies the column-major matrix `src` (src_rows x src_cols) into the
// column-major matrix `dst` (dst_rows rows, src_cols columns) with a single
// thrust::copy_n call.
//
// Elements are enumerated in row-major order on both sides. `offset` is the
// row-major linear index in `dst` at which the first source element lands.
// Both row-major sequences are translated to column-major storage indices
// by rm2cm_idx_functor.
void copyMat(thrust::device_vector<my_type>& src, thrust::device_vector<my_type>& dst, unsigned src_rows, unsigned src_cols, unsigned dst_rows, unsigned offset){
   typedef thrust::counting_iterator<int> CountIt;
   typedef thrust::transform_iterator<rm2cm_idx_functor, CountIt> IndexIt;

   // Storage indices to read from: row-major walk over the source matrix.
   const IndexIt readIdx = thrust::make_transform_iterator(CountIt(0), rm2cm_idx_functor(src_rows, src_cols));
   // Storage indices to write to: row-major walk over the destination,
   // starting `offset` elements in.
   const IndexIt writeIdx = thrust::make_transform_iterator(CountIt(offset), rm2cm_idx_functor(dst_rows, src_cols));

   thrust::copy_n(
      thrust::make_permutation_iterator(src.begin(), readIdx),
      src_rows*src_cols,
      thrust::make_permutation_iterator(dst.begin(), writeIdx));
}



// Demo driver: builds a 2x3 and a 3x3 column-major matrix, stacks them
// vertically into one column-major result via copyMat, and prints the
// inputs and the result both as a grid and in storage order.
int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    //generate matrices
    std::vector<thrust::device_vector<my_type> > matrices(2);
    for(size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], static_cast<int>(i)*10);

        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    //copy
    // Derive the result shape from the inputs instead of hard-coding it:
    // vertical stacking adds up the row counts, and copyMat requires every
    // source to have the same column count as the destination.
    int resultRows = 0;
    for(size_t i = 0; i < rows.size(); ++i) {
        resultRows += rows[i];
    }
    const int resultCols = cols[0];
    thrust::device_vector<my_type> result(resultRows * resultCols);

    // Row-major linear position in `result` where the next matrix starts.
    int offset = 0;

    // size_t index avoids the signed/unsigned comparison with size().
    for(size_t i = 0; i < matrices.size(); ++i) {
      copyMat(matrices[i], result, rows[i], cols[i], resultRows, offset);
      offset += rows[i]*cols[i];
    }


    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

这还假设源矩阵的列数 == 目标矩阵的列数，这似乎隐含在问题描述中。照例提醒：我不保证这段代码没有 bug，但它似乎能正确处理原始问题中给出的测试用例。

这种方法可能还可以进一步改进。目前，与 thrust::copy_n 调用相关的读操作和写操作都是非合并（uncoalesced）的内存访问。我们可以让这两个操作中的一个变为合并访问，从而进一步改进。这需要把读取和写入两个索引转换函子的效果合并到一个映射函子中，该函子同时考虑源矩阵和目标矩阵的维度。有了单个映射函子，copy_n 调用的第一个参数就可以直接是源向量。我认为也可以改用 thrust::gather 或 thrust::scatter。不过，我还没有完全弄清楚。

你能添加一个简短而完整的例子来说明你想要实现的目标吗？你实际上是想做相当于矩阵转置的操作吗？即在复制操作期间从列主序重新排列为行主序？我扩展了上面的例子。问题是，我想将两个或多个矩阵垂直合并/堆叠到一个结果矩阵中。因此，我先把第一个矩阵复制到结果中，以此类推，但由于矩阵以列主序格式存储，结果中的元素顺序是错误的。请提供完整的示例，即我无需添加或更改任何内容就可以复制、编译和运行的代码。本站的要求是：1. 关于您所编写代码的问题，必须在问题本身中描述具体的问题，并包含能够重现问题的有效代码。请参阅相关指南（原文链接已丢失）以获取指导。现在已经提供了一个 SSCCE。在没有测试的情况下，我猜它会比一个非常简单的自己编写的内核要慢，因为要重复进行内核启动/复制调用？难道没有办法编写一个输出迭代器吗？另外，是的，我的意思是垂直拼接，我已在问题中更正了这一点。是的，我确信它会比定制内核慢。但我认为它会比您给出的代码快得多，特别是对于大型矩阵。我并没有说用单次 copy 调用无法做到，我只是说我还没想出来。也许其他人会有想法。我已经更新了我的答案，给出了一种新方法：对每个要复制的源矩阵，只需调用一次 thrust::copy_n 即可完成复制。