C++ 一维数组是否比 Eigen 动态向量快？_C++_Arrays_Performance_Eigen

C++ 一维数组是否比 Eigen 动态向量快？

c++ arrays performance

C++ 一维数组是否比 Eigen 动态向量快？,c++,arrays,performance,eigen,C++,Arrays,Performance,Eigen,我使用大型矩阵（100x100到3000x3000）进行一些计算（大量求和以及多达120次矩阵-向量乘法），并使用 Eigen 库来处理向量和矩阵。我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用一维数组、使用std::vector，还是使用其他库？假设您不想迁移到GPU，并且如果您相信Eigen官网上的基准测试，那么Eigen非常快。具体来说，在您指定的范围内，Eigen名列前茅。请确保已启用OpenMP，因为Eigen会利用它。同样地，假设您不想迁移到GPU，并且如果您相信Eigen官网上的基准测试，那么Eigen非常……

我使用大型矩阵（100x100到3000x3000）进行一些计算（大量求和以及多达120次矩阵-向量乘法），并使用 Eigen 库来处理向量和矩阵。

我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用一维数组、使用std::vector，还是使用其他库？

假设您不想迁移到GPU，并且如果您相信Eigen官网上的基准测试，那么Eigen非常快。具体来说，在您指定的范围内，Eigen名列前茅。请确保已启用OpenMP，因为Eigen会利用它。

假设您不想迁移到GPU，并且如果您相信Eigen官网上的基准测试，那么Eigen非常快。具体来说，在您指定的范围内，Eigen名列前茅。请确保已启用OpenMP，因为Eigen会利用它。

我做了一次比较，比较了Eigen和ViennaCL（均为调试模式）：

使用的代码是：

//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <iostream>
#include <fstream>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
// Stand-in type for the `devices` variable in main() when
// VIENNACL_WITH_OPENCL is not defined. size() always returns 1.
struct dummy
{
    std::size_t size() const
    {
        const std::size_t count = 1;
        return count;
    }
};
#endif


/**
 * Benchmark of one dense matrix product computed with Eigen and with ViennaCL.
 *
 * Builds a (points x transRows) matrix A and a (transRows x transCols) matrix B
 * filled with random values, multiplies them once with Eigen (row labelled
 * "CPU") and once with ViennaCL (row labelled "GPU"), and writes one CSV row
 * per run to both stdout and resultsFile.txt. The ViennaCL timing includes the
 * fast_copy() transfers of A and B as well as the product itself.
 *
 * @return EXIT_SUCCESS on completion; EXIT_FAILURE when built with OpenCL
 *         support and the current context reports no device.
 */
int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
    // Indexing devices[0] on an empty list is undefined behaviour, so bail out early.
    if (devices.empty())
    {
        std::cerr << "No OpenCL device available in the current ViennaCL context" << std::endl;
        return EXIT_FAILURE;
    }
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#else
    dummy devices;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    const std::size_t points = 230000;
    const std::size_t transRows = 4;
    const std::size_t transCols = 3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrix in normal memory; the Eigen and STL copies receive the same values
    for (std::size_t i = 0; i < points; ++i)
    {
        for (std::size_t j = 0; j < transRows; ++j)
        {
            const ScalarType value = random<ScalarType>();
            stl_A[i*transRows + j] = value;
            eigen_A(i,j) = value;
        }
    }
    for (std::size_t i = 0; i < transRows; ++i)
    {
        for (std::size_t j = 0; j < transCols; ++j)
        {
            // eigen_B must mirror stl_B (not stl_A) so both runs multiply the same matrices
            const ScalarType value = random<ScalarType>();
            stl_B[i*transCols + j] = value;
            eigen_B(i,j) = value;
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    // Wait for the queued work to finish before reading the timer
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    // Keep the console window open until the user confirms
    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}

//禁用调试机制以与ublas进行公平比较：
#ifndef NDEBUG
#定义NDEBUG
#恩迪夫
//
//包括必要的系统标题
//
#包括
#包括
//
//ublas包括
//
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义特征值是我知道稀疏模还不稳定
//
//本征包括
//
#包括
#包括
//如果要在ublas对象上使用ViennaCL算法，则必须设置
#用UBLAS 1定义VIENNACL_
//
//重要提示：如果要在特征对象上使用ViennaCL算法，必须在任何ViennaCL包含之前设置
//
#用特征值1定义VIENNACL_
//
//重要提示：如果要使用OPENCL使用ViennaCL算法，必须在任何ViennaCL包含之前设置
//
#ifndef VIENNACL_和OPENCL
#用OPENCL定义VIENNACL_
#恩迪夫
//
//维也纳包括
//
#包括“viennacl/scalar.hpp”
#包括“viennacl/vector.hpp”
#包括“viennacl/matrix.hpp”
#包括“viennacl/linalg/prod.hpp”
#包括“viennacl/compressed_matrix.hpp”
#包括“viennacl/linalg/sparse_matrix_operations.hpp”
#包括“viennacl/linalg/ilu.hpp”
#包括“viennacl/linalg/detail/ilu/block_ilu.hpp”
#包括“viennacl/linalg/direct_solve.hpp”
#包括“viennacl/linalg/Biggstab.hpp”
//本教程的一些辅助函数：
#包括“示例/教程/Random.hpp”
#包括“示例/教程/向量io.hpp”
#包括“示例/基准/基准utils.hpp”
#定义BLAS3_矩阵_大小700
使用名称空间boost:：numeric；
#ifndef VIENNACL_和OPENCL
结构虚拟
{
std:：size\u t size（）常量{return 1；}
};
#恩迪夫
int main（）
{
typedef float ScalarType；
定时器；
双执行时间；
//
//在上下文中初始化OpenCL设备
//
std::cout
我做了一次比较，比较了Eigen和ViennaCL（均为调试模式）：
使用的代码是：
//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <iostream>
#include <fstream>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
// Stand-in type for the `devices` variable in main() when
// VIENNACL_WITH_OPENCL is not defined. size() always returns 1.
struct dummy
{
    std::size_t size() const
    {
        const std::size_t count = 1;
        return count;
    }
};
#endif


/**
 * Benchmark of one dense matrix product computed with Eigen and with ViennaCL.
 *
 * Builds a (points x transRows) matrix A and a (transRows x transCols) matrix B
 * filled with random values, multiplies them once with Eigen (row labelled
 * "CPU") and once with ViennaCL (row labelled "GPU"), and writes one CSV row
 * per run to both stdout and resultsFile.txt. The ViennaCL timing includes the
 * fast_copy() transfers of A and B as well as the product itself.
 *
 * @return EXIT_SUCCESS on completion; EXIT_FAILURE when built with OpenCL
 *         support and the current context reports no device.
 */
int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
    // Indexing devices[0] on an empty list is undefined behaviour, so bail out early.
    if (devices.empty())
    {
        std::cerr << "No OpenCL device available in the current ViennaCL context" << std::endl;
        return EXIT_FAILURE;
    }
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#else
    dummy devices;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    const std::size_t points = 230000;
    const std::size_t transRows = 4;
    const std::size_t transCols = 3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrix in normal memory; the Eigen and STL copies receive the same values
    for (std::size_t i = 0; i < points; ++i)
    {
        for (std::size_t j = 0; j < transRows; ++j)
        {
            const ScalarType value = random<ScalarType>();
            stl_A[i*transRows + j] = value;
            eigen_A(i,j) = value;
        }
    }
    for (std::size_t i = 0; i < transRows; ++i)
    {
        for (std::size_t j = 0; j < transCols; ++j)
        {
            // eigen_B must mirror stl_B (not stl_A) so both runs multiply the same matrices
            const ScalarType value = random<ScalarType>();
            stl_B[i*transCols + j] = value;
            eigen_B(i,j) = value;
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    // Wait for the queued work to finish before reading the timer
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    // Keep the console window open until the user confirms
    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}

//禁用调试机制以与ublas进行公平比较：
#ifndef NDEBUG
#定义NDEBUG
#恩迪夫
//
//包括必要的系统标题
//
#包括
#包括
//
//ublas包括
//
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义特征值是我知道稀疏模还不稳定
//
//本征包括
//
#包括
#包括
//如果要在ublas对象上使用ViennaCL算法，则必须设置
#用UBLAS 1定义VIENNACL_
//
//重要提示：如果要在特征对象上使用ViennaCL算法，必须在任何ViennaCL包含之前设置
//
#用特征值1定义VIENNACL_
//
//重要提示：如果要使用OPENCL使用ViennaCL算法，必须在任何ViennaCL包含之前设置
//
#ifndef VIENNACL_和OPENCL
#用OPENCL定义VIENNACL_
#恩迪夫
//
//维也纳包括
//
#包括“viennacl/scalar.hpp”
#包括“viennacl/vector.hpp”
#包括“viennacl/matrix.hpp”
#包括“viennacl/linalg/prod.hpp”
#包括“viennacl/compressed_matrix.hpp”
#包括“viennacl/linalg/sparse_matrix_operations.hpp”
#包括“viennacl/linalg/ilu.hpp”
#包括“viennacl/linalg/detail/ilu/block_ilu.hpp”
#包括“viennacl/linalg/direct_solve.hpp”
#包括“viennacl/linalg/Biggstab.hpp”
//本教程的一些辅助函数：
#包括“示例/教程/Random.hpp”
#包括“示例/教程/向量io.hpp”
#包括“示例/基准/基准utils.hpp”
#定义BLAS3_矩阵_大小700
使用名称空间boost:：numeric；
#ifndef VIENNACL_和OPENCL
结构虚拟
{
std:：size\u t size（）常量{return 1；}
};
#恩迪夫
int main（）
{
typedef float ScalarType；
定时器；
双执行时间；
//
//在上下文中初始化OpenCL设备
//
std::cout
关于其他库：某些多线程BLAS库可能也有稀疏矩阵实现。OpenCL替代方案：您可能想试试这个库，它速度非常快，使用也相当方便。请使用性能分析来找出瓶颈所在，不要猜测。您提到矩阵-向量乘积：请确保向量在编译时确实是向量，例如VectorXd，或者MatrixXd的行/列。
@NeilKirk 我在这里分析了这个问题：
关于其他库：某些多线程BLAS库可能也有稀疏矩阵实现。OpenCL替代方案：您可能想试试这个库，它速度非常快，使用也相当简单。请使用性能分析来找出瓶颈在哪里，不要猜测。您提到矩阵-向量乘积：请确保向量在编译时确实是一个向量，例如VectorXd，或者MatrixXd的行/列。
@NeilKirk 我在这里分析了这个问题：
我实际上已经启用了它们（-fopenmp -msse2）。顺便说一下，通过注释掉部分代码并拆分函数，我发现我的一个函数耗时很长：它接收一个矩阵作为参数并返回另一个矩阵，在双重“for”循环中对每个元素调用C++函数“erf(double)”。有没有办法加快它的计算速度？@Naucle 请另开一个问题并附上有问题的代码，同时尝试自己解决（并进行性能分析）