C++ 一维数组是否比 Eigen 动态向量更快?

C++ 一维阵列是否比特征动态向量快?,c++,arrays,performance,eigen,C++,Arrays,Performance,Eigen,我使用大型矩阵(100x100到3000x3000)进行一些计算(大量求和和和多达120个矩阵向量乘法),我使用特征库来计算向量和矩阵 我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用1d数组、使用std::vector还是使用其他库?假设您不想迁移到GPU,并且如果您想信任Eigen的页面,Eigen非常快。您特别提到,在您指定的范围内,Eigen位于顶部。确保OpenMP已启用,因为Eigen将利用。同样地,假设您不想迁移到GPU,并且如果您想信任Eigen的页面,Eigen非常

我使用大型矩阵(100x100到3000x3000)进行一些计算(大量求和以及多达120个矩阵向量乘法),并使用 Eigen 库来处理向量和矩阵。


我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用1d数组、使用std::vector还是使用其他库?

假设您不想迁移到GPU,并且如果您愿意相信Eigen的页面,Eigen非常快。特别是在您指定的范围内,Eigen位于顶部。请确保已启用OpenMP,因为Eigen会利用它。同样地,

假设您不想迁移到GPU,并且如果您愿意相信Eigen的页面,Eigen非常快。特别是在您指定的范围内,Eigen位于顶部。请确保已启用OpenMP,因为Eigen会利用它。与.

一样,我对Eigen和ViennaCL做了一次比较(两者均在调试模式下):

使用的代码是:

//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <iostream>
#include <fstream>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
// Placeholder type compiled only when VIENNACL_WITH_OPENCL is not defined.
// main() declares a `dummy devices;` in that configuration; size() always
// reports exactly one entry.
struct dummy
{
    std::size_t size() const
    {
        return static_cast<std::size_t>(1);
    }
};
#endif


// Benchmark driver.
//
// Builds one dense (points x transRows) matrix A and one (transRows x transCols)
// matrix B with random single-precision entries, then times C = A * B
//   * on the CPU with Eigen, and
//   * on the OpenCL device with ViennaCL (host->device copy included in the timing).
// Each measurement is written as a CSV row to "resultsFile.txt" and to stdout;
// the time value is whatever Timer::get() reports.
//
// Returns EXIT_SUCCESS on completion, EXIT_FAILURE if no OpenCL device is available.
int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
    // Indexing devices[0] on an empty list would be undefined behaviour, so bail out early.
    if (devices.empty())
    {
        std::cerr << "No OpenCL device available in the current ViennaCL context" << std::endl;
        return EXIT_FAILURE;
    }
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#else
    dummy devices;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    if (!resultsFile)
    {
        // Keep going: every row is also echoed to stdout, and writes to a failed stream are no-ops.
        std::cerr << "Warning: could not open resultsFile.txt, results are printed to stdout only" << std::endl;
    }
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    const size_t points = 230000;
    const size_t transRows = 4;
    const size_t transCols = 3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrices in normal memory; the flat buffers are indexed row-major
    // and the Eigen matrices receive exactly the same values.
    for (size_t i = 0; i < points; ++i)
    {
        for (size_t j = 0; j < transRows; ++j)
        {
            const ScalarType value = random<ScalarType>();
            stl_A[i*transRows + j] = value;
            eigen_A(i,j) = value;
        }
    }
    for (size_t i = 0; i < transRows; ++i)
    {
        for (size_t j = 0; j < transCols; ++j)
        {
            // eigen_B must mirror stl_B (not stl_A) so that the CPU and GPU runs
            // multiply by the same matrix.
            const ScalarType value = random<ScalarType>();
            stl_B[i*transCols + j] = value;
            eigen_B(i,j) = value;
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    // Note: this measurement deliberately starts before the host->device copy,
    // so it covers transfer + allocation of vcl_C + product, unlike the CPU number above.
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    // NOTE(review): fast_copy transfers the raw buffer as-is - confirm that the
    // row-major, unpadded layout of stl_A/stl_B matches vcl_A/vcl_B's internal layout.
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    // Wait for the backend to finish before reading the timer.
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    // Keep the console window open until the user confirms.
    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}
//禁用调试机制以与ublas进行公平比较:
#ifndef NDEBUG
#定义NDEBUG
#恩迪夫
//
//包括必要的系统标题
//
#包括
#包括
//
//ublas包括
//
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义特征值是我知道稀疏模还不稳定
//
//本征包括
//
#包括
#包括
//如果要在ublas对象上使用ViennaCL算法,则必须设置
#用UBLAS 1定义VIENNACL_
//
//重要提示:如果要在特征对象上使用ViennaCL算法,必须在任何ViennaCL包含之前设置
//
#用特征值1定义VIENNACL_
//
//重要提示:如果要使用OPENCL使用ViennaCL算法,必须在任何ViennaCL包含之前设置
//
#ifndef VIENNACL_和OPENCL
#用OPENCL定义VIENNACL_
#恩迪夫
//
//维也纳包括
//
#包括“viennacl/scalar.hpp”
#包括“viennacl/vector.hpp”
#包括“viennacl/matrix.hpp”
#包括“viennacl/linalg/prod.hpp”
#包括“viennacl/compressed_matrix.hpp”
#包括“viennacl/linalg/sparse_matrix_operations.hpp”
#包括“viennacl/linalg/ilu.hpp”
#包括“viennacl/linalg/detail/ilu/block_ilu.hpp”
#包括“viennacl/linalg/direct_solve.hpp”
#包括“viennacl/linalg/Biggstab.hpp”
//本教程的一些辅助函数:
#包括“示例/教程/Random.hpp”
#包括“示例/教程/向量io.hpp”
#包括“示例/基准/基准utils.hpp”
#定义BLAS3_矩阵_大小700
使用名称空间boost::numeric;
#ifndef VIENNACL_和OPENCL
结构虚拟
{
std::size\u t size()常量{return 1;}
};
#恩迪夫
int main()
{
typedef float ScalarType;
定时器;
双执行时间;
//
//在上下文中初始化OpenCL设备
//

std::cout我做了一次比较,比较了Eigen和ViennaCL(都在调试中):

使用的代码是:

//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <iostream>
#include <fstream>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
// Placeholder type compiled only when VIENNACL_WITH_OPENCL is not defined.
// main() declares a `dummy devices;` in that configuration; size() always
// reports exactly one entry.
struct dummy
{
    std::size_t size() const
    {
        return static_cast<std::size_t>(1);
    }
};
#endif


// Benchmark driver.
//
// Builds one dense (points x transRows) matrix A and one (transRows x transCols)
// matrix B with random single-precision entries, then times C = A * B
//   * on the CPU with Eigen, and
//   * on the OpenCL device with ViennaCL (host->device copy included in the timing).
// Each measurement is written as a CSV row to "resultsFile.txt" and to stdout;
// the time value is whatever Timer::get() reports.
//
// Returns EXIT_SUCCESS on completion, EXIT_FAILURE if no OpenCL device is available.
int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
    // Indexing devices[0] on an empty list would be undefined behaviour, so bail out early.
    if (devices.empty())
    {
        std::cerr << "No OpenCL device available in the current ViennaCL context" << std::endl;
        return EXIT_FAILURE;
    }
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#else
    dummy devices;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    if (!resultsFile)
    {
        // Keep going: every row is also echoed to stdout, and writes to a failed stream are no-ops.
        std::cerr << "Warning: could not open resultsFile.txt, results are printed to stdout only" << std::endl;
    }
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    const size_t points = 230000;
    const size_t transRows = 4;
    const size_t transCols = 3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrices in normal memory; the flat buffers are indexed row-major
    // and the Eigen matrices receive exactly the same values.
    for (size_t i = 0; i < points; ++i)
    {
        for (size_t j = 0; j < transRows; ++j)
        {
            const ScalarType value = random<ScalarType>();
            stl_A[i*transRows + j] = value;
            eigen_A(i,j) = value;
        }
    }
    for (size_t i = 0; i < transRows; ++i)
    {
        for (size_t j = 0; j < transCols; ++j)
        {
            // eigen_B must mirror stl_B (not stl_A) so that the CPU and GPU runs
            // multiply by the same matrix.
            const ScalarType value = random<ScalarType>();
            stl_B[i*transCols + j] = value;
            eigen_B(i,j) = value;
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    // Note: this measurement deliberately starts before the host->device copy,
    // so it covers transfer + allocation of vcl_C + product, unlike the CPU number above.
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    // NOTE(review): fast_copy transfers the raw buffer as-is - confirm that the
    // row-major, unpadded layout of stl_A/stl_B matches vcl_A/vcl_B's internal layout.
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    // Wait for the backend to finish before reading the timer.
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    // Keep the console window open until the user confirms.
    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}
//禁用调试机制以与ublas进行公平比较:
#ifndef NDEBUG
#定义NDEBUG
#恩迪夫
//
//包括必要的系统标题
//
#包括
#包括
//
//ublas包括
//
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义特征值是我知道稀疏模还不稳定
//
//本征包括
//
#包括
#包括
//如果要在ublas对象上使用ViennaCL算法,则必须设置
#用UBLAS 1定义VIENNACL_
//
//重要提示:如果要在特征对象上使用ViennaCL算法,必须在任何ViennaCL包含之前设置
//
#用特征值1定义VIENNACL_
//
//重要提示:如果要使用OPENCL使用ViennaCL算法,必须在任何ViennaCL包含之前设置
//
#ifndef VIENNACL_和OPENCL
#用OPENCL定义VIENNACL_
#恩迪夫
//
//维也纳包括
//
#包括“viennacl/scalar.hpp”
#包括“viennacl/vector.hpp”
#包括“viennacl/matrix.hpp”
#包括“viennacl/linalg/prod.hpp”
#包括“viennacl/compressed_matrix.hpp”
#包括“viennacl/linalg/sparse_matrix_operations.hpp”
#包括“viennacl/linalg/ilu.hpp”
#包括“viennacl/linalg/detail/ilu/block_ilu.hpp”
#包括“viennacl/linalg/direct_solve.hpp”
#包括“viennacl/linalg/Biggstab.hpp”
//本教程的一些辅助函数:
#包括“示例/教程/Random.hpp”
#包括“示例/教程/向量io.hpp”
#包括“示例/基准/基准utils.hpp”
#定义BLAS3_矩阵_大小700
使用名称空间boost::numeric;
#ifndef VIENNACL_和OPENCL
结构虚拟
{
std::size\u t size()常量{return 1;}
};
#恩迪夫
int main()
{
typedef float ScalarType;
定时器;
双执行时间;
//
//在上下文中初始化OpenCL设备
//

std::cout Re other LIB:可能有些多线程BLA也有稀疏矩阵实现。OpenCL替代方案:您可能想试试速度非常快、使用非常方便的方法。使用分析来找出瓶颈所在。不要猜测。您提到矩阵向量产品:确保向量确实是vector在编译时,如
VectorXd
MatrixXd
@NeilKirk的行/列,我在这里分析了这个问题:Re-other-libs:可能有些多线程BLA也有稀疏的矩阵实现。OpenCL替代方案:您可能想尝试一下速度非常快、使用相当简单的方法。使用分析来找出瓶颈在哪里。不要猜测。你提到矩阵向量积:确保向量在编译时确实是一个向量,比如
VectorXd
MatrixXd
的行/列。@NeilKirk 我在这里分析了这个问题:我实际上已经启用了它们(-fopenmp -msse2)。顺便说一下,通过注释和拆分函数,我发现我的一个函数——它以一个矩阵作为参数并返回另一个矩阵,通过双重“for”循环对每个元素调用C++函数“erf(double)”——需要很长的计算时间。有没有办法加快计算速度?@Naucle 请另提一个附带问题代码的问题,并尝试解决(和profi)