C++ 一维数组是否比 Eigen 动态向量更快?
我使用大型矩阵(100x100到3000x3000)进行一些计算(大量求和以及多达120个矩阵向量乘法),我使用 Eigen 库来计算向量和矩阵。C++ 一维数组是否比 Eigen 动态向量更快?,c++,arrays,performance,eigen,C++,Arrays,Performance,Eigen,我使用大型矩阵(100x100到3000x3000)进行一些计算(大量求和以及多达120个矩阵向量乘法),我使用 Eigen 库来计算向量和矩阵。我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用1d数组、使用std::vector还是使用其他库?假设您不想迁移到GPU,并且如果您愿意相信Eigen的基准测试页面,Eigen非常快。特别是在您指定的矩阵大小范围内,Eigen名列前茅。确保OpenMP已启用,因为Eigen会利用它。同样地,假设您不想迁移到GPU,并且如果您愿意相信Eigen的基准测试页面,Eigen非常
我想知道怎样才能加快我的程序。我应该继续使用Eigen、使用1d数组、使用std::vector还是使用其他库?假设您不想迁移到GPU,并且如果您想信任Eigen的页面,Eigen非常快。您特别提到,在您指定的范围内,Eigen位于顶部。确保OpenMP已启用,因为Eigen将利用。同样地,假设您不想迁移到GPU,并且如果您想信任Eigen的页面,Eigen非常快。您特别提到,在您指定的范围内,Eigen位于顶部。确保OpenMP已启用,因为Eigen将利用。与.一样,我做了一次比较,比较了Eigen和ViennaCL(都在调试中): 使用的代码是:
//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif
//
// include necessary system headers
//
#include <iostream>
#include <fstream>
//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>
// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"
// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"
#include "examples/benchmarks/benchmark-utils.hpp"
#define BLAS3_MATRIX_SIZE 700
using namespace boost::numeric;
#ifndef VIENNACL_WITH_OPENCL
// Placeholder used in place of the OpenCL device list when the program is
// compiled without VIENNACL_WITH_OPENCL; it always reports a size of one.
struct dummy
{
  std::size_t size() const
  {
    const std::size_t device_count = 1;
    return device_count;
  }
};
#endif
// Benchmark entry point.
//
// Times one dense single-precision product (points x transRows) * (transRows x transCols)
// with Eigen on the CPU and with ViennaCL on the selected OpenCL device, and writes one
// "unit,size,time" row per run to resultsFile.txt and to stdout.
// Note that the ViennaCL measurement starts before the host-to-device copies, whereas the
// Eigen measurement covers the product only.
// Returns EXIT_SUCCESS after the user presses ENTER.
int main()
{
  typedef float ScalarType;
  Timer timer;
  double exec_time;
  //
  // Initialize OpenCL device in the context
  //
  // Fixed: the banner previously said "OpenGL" although an OpenCL device is selected.
  std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
  dummy devices;
#endif
#ifdef VIENNACL_WITH_OPENCL
  // Use the first device reported by the current context.
  viennacl::ocl::current_context().switch_device(devices[0]);
  std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif
  //// Output results file
  std::ofstream resultsFile;
  resultsFile.open("resultsFile.txt");
  resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
  std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;
  // Start defining the dense matrices
  size_t points = 230000;
  size_t transRows=4;
  size_t transCols=3;
  // Other alternative: Use Eigen
  Eigen::MatrixXf eigen_A(points, transRows);
  Eigen::MatrixXf eigen_B(transRows, transCols);
  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  std::vector<ScalarType> stl_A(points * transRows);
  std::vector<ScalarType> stl_B(transRows * transCols);
  // Set up the ViennaCL object
  viennacl::matrix<ScalarType> vcl_A(points, transRows);
  viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
  // Fill dense matrix in normal memory (row-major in the STL buffers, mirrored into Eigen)
  for (size_t i = 0; i < points; ++i)
  {
    for (size_t j = 0; j < transRows; ++j)
    {
      const size_t idx = i*transRows + j;
      stl_A[idx] = random<ScalarType>();
      eigen_A(i,j) = stl_A[idx];
    }
  }
  for (size_t i = 0; i < transRows; ++i)
  {
    for (size_t j = 0; j < transCols; ++j)
    {
      const size_t idx = i*transCols + j;
      stl_B[idx] = random<ScalarType>();
      // Fixed: mirror stl_B (previously stl_A) so that the CPU and GPU runs
      // multiply by the same right-hand matrix.
      eigen_B(i,j) = stl_B[idx];
    }
  }
  // Perform the matrix*matrix product
  // On CPU
  Eigen::MatrixXf eigen_C(points, transCols);
  timer.start();
  eigen_C = eigen_A * eigen_B;
  exec_time = timer.get();
  resultsFile << "CPU," << points << "," << exec_time << std::endl;
  std::cout << "CPU," << points << "," << exec_time << std::endl;
  // on GPU
  timer.start();
  // Copy to gpu memory
  // Using fastcopy I can get ~500x speed improvement
  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);
  viennacl::matrix<ScalarType> vcl_C(points, transCols);
  vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
  // Wait for the backend to finish before reading the timer.
  viennacl::backend::finish();
  exec_time = timer.get();
  resultsFile << "GPU," << points << "," << exec_time << std::endl;
  std::cout << "GPU," << points << "," << exec_time << std::endl;
  //// Start defining the dense matrices
  //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
  //{
  // // Other alternative: Use Eigen
  // Eigen::MatrixXf eigen_A(denseSize, denseSize);
  // // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  // std::vector<ScalarType> stl_A(denseSize * denseSize);
  // // Set up the ViennaCL object
  // viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
  // // Fill dense matrix in normal memory
  // for (unsigned int i = 0; i < denseSize; ++i)
  // {
  // for (unsigned int j = 0; j < denseSize; ++j)
  // {
  // stl_A[i*denseSize + j] = random<ScalarType>();
  // eigen_A(i,j) = stl_A[i*denseSize + j];
  // }
  // }
  // // Perform the matrix*matrix product
  // // On CPU
  // Eigen::MatrixXf eigen_C(denseSize, denseSize);
  // timer.start();
  // eigen_C = eigen_A * eigen_A;
  // exec_time = timer.get();
  // resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
  // std::cout << "CPU," << denseSize << "," << exec_time << std::endl;
  // // on GPU
  // timer.start();
  // // Copy to gpu memory
  // // Using fastcopy I can get ~500x speed improvement
  // viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  // viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
  // vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
  // viennacl::backend::finish();
  // exec_time = timer.get();
  // resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
  // std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
  //}
  //// Start defining the sparse matrices
  //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
  //{
  // for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
  // {
  // // Other alternative: Use Eigen
  // Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
  // // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  // std::vector<ScalarType> stl_A(sparseSize * sparseSize);
  // // Set up the ViennaCL sparse matrix
  // viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
  // // Fill dense matrix in normal memory
  // for (size_t i=0; i<sparseSize; ++i)
  // {
  // for (size_t j=0; j<sparseSize; ++j)
  // {
  // if (((rand()%100)/100.0) <= sparsePerc)
  // {
  // stl_A[i*sparseSize + j] = float(rand());
  // eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
  // }
  // }
  // }
  // // Perform the matrix*matrix product
  // // On CPU
  // Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
  // timer.start();
  // eigen_C = (eigen_A * eigen_A).pruned();
  // exec_time = timer.get();
  // resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // // on GPU
  // timer.start();
  // // Copy to gpu memory
  // // Using fastcopy I can get ~500x speed improvement
  // viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  // viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
  // vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
  // viennacl::backend::finish();
  // exec_time = timer.get();
  // resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // }
  //}
  resultsFile.close();
  //
  // That's it.
  //
  // Keep the console open until the user confirms.
  std::cout << "Press [ENTER] to exit " << std::endl;
  std::cin.get();
  return EXIT_SUCCESS;
}
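//禁用调试机制以与ublas进行公平比较:
#ifndef NDEBUG
#define NDEBUG
#endif
//
//包含必要的系统头文件
//
#include <iostream>
#include <fstream>
//
//ublas头文件
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
//Eigen头文件
//
#include <Eigen/Core>
#include <Eigen/Dense>
//如果要在ublas对象上使用ViennaCL算法,则必须设置
#define VIENNACL_WITH_UBLAS 1
//
//重要提示:如果要在Eigen对象上使用ViennaCL算法,必须在任何ViennaCL头文件之前设置
//
#define VIENNACL_WITH_EIGEN 1
//
//重要提示:如果要通过OpenCL使用ViennaCL算法,必须在任何ViennaCL头文件之前设置
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
//
//ViennaCL头文件
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"
//本教程的一些辅助函数:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"
#include "examples/benchmarks/benchmark-utils.hpp"
#define BLAS3_MATRIX_SIZE 700
using namespace boost::numeric;
#ifndef VIENNACL_WITH_OPENCL
struct dummy
{
std::size_t size() const { return 1; }
};
#endif
int main()
{
typedef float ScalarType;
Timer timer;
double exec_time;
//
//在上下文中初始化OpenCL设备
//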
std::cout我做了一次比较,比较了Eigen和ViennaCL(都在调试中):
使用的代码是:
//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif
//
// include necessary system headers
//
#include <iostream>
#include <fstream>
//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>
// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"
// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"
#include "examples/benchmarks/benchmark-utils.hpp"
#define BLAS3_MATRIX_SIZE 700
using namespace boost::numeric;
#ifndef VIENNACL_WITH_OPENCL
// Placeholder used in place of the OpenCL device list when the program is
// compiled without VIENNACL_WITH_OPENCL; it always reports a size of one.
struct dummy
{
  std::size_t size() const
  {
    const std::size_t device_count = 1;
    return device_count;
  }
};
#endif
// Benchmark entry point.
//
// Times one dense single-precision product (points x transRows) * (transRows x transCols)
// with Eigen on the CPU and with ViennaCL on the selected OpenCL device, and writes one
// "unit,size,time" row per run to resultsFile.txt and to stdout.
// Note that the ViennaCL measurement starts before the host-to-device copies, whereas the
// Eigen measurement covers the product only.
// Returns EXIT_SUCCESS after the user presses ENTER.
int main()
{
  typedef float ScalarType;
  Timer timer;
  double exec_time;
  //
  // Initialize OpenCL device in the context
  //
  // Fixed: the banner previously said "OpenGL" although an OpenCL device is selected.
  std::cout << std::endl << "--- initialized OpenCL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
  dummy devices;
#endif
#ifdef VIENNACL_WITH_OPENCL
  // Use the first device reported by the current context.
  viennacl::ocl::current_context().switch_device(devices[0]);
  std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif
  //// Output results file
  std::ofstream resultsFile;
  resultsFile.open("resultsFile.txt");
  resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
  std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;
  // Start defining the dense matrices
  size_t points = 230000;
  size_t transRows=4;
  size_t transCols=3;
  // Other alternative: Use Eigen
  Eigen::MatrixXf eigen_A(points, transRows);
  Eigen::MatrixXf eigen_B(transRows, transCols);
  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  std::vector<ScalarType> stl_A(points * transRows);
  std::vector<ScalarType> stl_B(transRows * transCols);
  // Set up the ViennaCL object
  viennacl::matrix<ScalarType> vcl_A(points, transRows);
  viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
  // Fill dense matrix in normal memory (row-major in the STL buffers, mirrored into Eigen)
  for (size_t i = 0; i < points; ++i)
  {
    for (size_t j = 0; j < transRows; ++j)
    {
      const size_t idx = i*transRows + j;
      stl_A[idx] = random<ScalarType>();
      eigen_A(i,j) = stl_A[idx];
    }
  }
  for (size_t i = 0; i < transRows; ++i)
  {
    for (size_t j = 0; j < transCols; ++j)
    {
      const size_t idx = i*transCols + j;
      stl_B[idx] = random<ScalarType>();
      // Fixed: mirror stl_B (previously stl_A) so that the CPU and GPU runs
      // multiply by the same right-hand matrix.
      eigen_B(i,j) = stl_B[idx];
    }
  }
  // Perform the matrix*matrix product
  // On CPU
  Eigen::MatrixXf eigen_C(points, transCols);
  timer.start();
  eigen_C = eigen_A * eigen_B;
  exec_time = timer.get();
  resultsFile << "CPU," << points << "," << exec_time << std::endl;
  std::cout << "CPU," << points << "," << exec_time << std::endl;
  // on GPU
  timer.start();
  // Copy to gpu memory
  // Using fastcopy I can get ~500x speed improvement
  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);
  viennacl::matrix<ScalarType> vcl_C(points, transCols);
  vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
  // Wait for the backend to finish before reading the timer.
  viennacl::backend::finish();
  exec_time = timer.get();
  resultsFile << "GPU," << points << "," << exec_time << std::endl;
  std::cout << "GPU," << points << "," << exec_time << std::endl;
  //// Start defining the dense matrices
  //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
  //{
  // // Other alternative: Use Eigen
  // Eigen::MatrixXf eigen_A(denseSize, denseSize);
  // // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  // std::vector<ScalarType> stl_A(denseSize * denseSize);
  // // Set up the ViennaCL object
  // viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
  // // Fill dense matrix in normal memory
  // for (unsigned int i = 0; i < denseSize; ++i)
  // {
  // for (unsigned int j = 0; j < denseSize; ++j)
  // {
  // stl_A[i*denseSize + j] = random<ScalarType>();
  // eigen_A(i,j) = stl_A[i*denseSize + j];
  // }
  // }
  // // Perform the matrix*matrix product
  // // On CPU
  // Eigen::MatrixXf eigen_C(denseSize, denseSize);
  // timer.start();
  // eigen_C = eigen_A * eigen_A;
  // exec_time = timer.get();
  // resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
  // std::cout << "CPU," << denseSize << "," << exec_time << std::endl;
  // // on GPU
  // timer.start();
  // // Copy to gpu memory
  // // Using fastcopy I can get ~500x speed improvement
  // viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  // viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
  // vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
  // viennacl::backend::finish();
  // exec_time = timer.get();
  // resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
  // std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
  //}
  //// Start defining the sparse matrices
  //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
  //{
  // for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
  // {
  // // Other alternative: Use Eigen
  // Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
  // // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  // std::vector<ScalarType> stl_A(sparseSize * sparseSize);
  // // Set up the ViennaCL sparse matrix
  // viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
  // // Fill dense matrix in normal memory
  // for (size_t i=0; i<sparseSize; ++i)
  // {
  // for (size_t j=0; j<sparseSize; ++j)
  // {
  // if (((rand()%100)/100.0) <= sparsePerc)
  // {
  // stl_A[i*sparseSize + j] = float(rand());
  // eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
  // }
  // }
  // }
  // // Perform the matrix*matrix product
  // // On CPU
  // Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
  // timer.start();
  // eigen_C = (eigen_A * eigen_A).pruned();
  // exec_time = timer.get();
  // resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // // on GPU
  // timer.start();
  // // Copy to gpu memory
  // // Using fastcopy I can get ~500x speed improvement
  // viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
  // viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
  // vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
  // viennacl::backend::finish();
  // exec_time = timer.get();
  // resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
  // }
  //}
  resultsFile.close();
  //
  // That's it.
  //
  // Keep the console open until the user confirms.
  std::cout << "Press [ENTER] to exit " << std::endl;
  std::cin.get();
  return EXIT_SUCCESS;
}
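//禁用调试机制以与ublas进行公平比较:
#ifndef NDEBUG
#define NDEBUG
#endif
//
//包含必要的系统头文件
//
#include <iostream>
#include <fstream>
//
//ublas头文件
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
//Eigen头文件
//
#include <Eigen/Core>
#include <Eigen/Dense>
//如果要在ublas对象上使用ViennaCL算法,则必须设置
#define VIENNACL_WITH_UBLAS 1
//
//重要提示:如果要在Eigen对象上使用ViennaCL算法,必须在任何ViennaCL头文件之前设置
//
#define VIENNACL_WITH_EIGEN 1
//
//重要提示:如果要通过OpenCL使用ViennaCL算法,必须在任何ViennaCL头文件之前设置
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif
//
//ViennaCL头文件
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"
//本教程的一些辅助函数:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"
#include "examples/benchmarks/benchmark-utils.hpp"
#define BLAS3_MATRIX_SIZE 700
using namespace boost::numeric;
#ifndef VIENNACL_WITH_OPENCL
struct dummy
{
std::size_t size() const { return 1; }
};
#endif
int main()
{
typedef float ScalarType;
Timer timer;
double exec_time;
//
//在上下文中初始化OpenCL设备
//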
std::cout Re other LIB:可能有些多线程BLA也有稀疏矩阵实现。OpenCL替代方案:您可能想试试速度非常快、使用非常方便的方法。使用分析来找出瓶颈所在。不要猜测。您提到矩阵向量产品:确保向量确实是vector在编译时,如VectorXd
或MatrixXd
@NeilKirk的行/列,我在这里分析了这个问题:Re-other-libs:可能有些多线程BLA也有稀疏的矩阵实现。OpenCL替代方案:您可能想尝试一下速度非常快、使用相当简单的方法。使用分析来找出瓶颈在哪里。不要猜测。你提到矩阵向量积:确保向量在编译时确实是一个向量,比如VectorXd
或MatrixXd
的行/列。@NeilKirk:我在这里分析了这个问题:我实际上已经启用了它们(-fopenmp -msse2)。顺便说一下,通过注释和拆分函数,我发现我的一个函数——它以一个矩阵作为参数并返回另一个矩阵,通过双重 for 循环对每个元素调用 C++ 函数 erf(double)——需要很长的计算时间,有什么办法可以加快计算速度吗?@Naucle:请另开一个包含问题代码的提问,并尝试先解决它(并进行性能分析)。