C++ 在 C++ 中以两种不同的方式将两个矩阵相加

C++ 在 C++ 中以两种不同的方式将两个矩阵相加,c++,C++,我只是想知道这两种情况之间是否存在差异 第一段代码中,外层 for 循环遍历行,内层 for 循环遍历列 第二段代码中,外层 for 循环遍历列,内层 for 循环遍历行 我应用这两种方法并得到相同的结果 您将得到相同的结果,但可能会得到不同的性能 矩阵最有可能以行主顺序存储,按行访问可能会获得更好的内存带宽和缓存利用率。试着对巨大的矩阵做同样的操作,然后测量实际耗时(wall time) 以下是计时方法。首先,我的结果。报告的数字是10000次试验的平均CPU时钟周期 $ clang++ -Ofast -DNDEBUG -std=c++1

我只是想知道这两种情况之间是否存在差异

第一段代码中,外层 for 循环遍历行,内层 for 循环遍历列

第二段代码中,外层 for 循环遍历列,内层 for 循环遍历行


我应用这两种方法并得到相同的结果

您将得到相同的结果,但可能会得到不同的性能

矩阵最有可能以行主顺序存储,因此按行访问可能会获得更好的内存带宽和缓存利用率。试着对巨大的矩阵做同样的操作,然后测量实际耗时(wall time)


以下是计时方法。首先,我的结果。报告的数字是10000次试验的平均CPU时钟周期

$ clang++ -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    60149
sequential: 27137
$ g++-4.9 -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    90517
sequential: 33407
顺序访问速度更快。原因在于缓存行为,特别是缓存行(cache line)。这是一篇关于这个主题的有趣文章

因为行和列是任意的,所以我区分了跨步和顺序,而不是行和列。通常在C++中,我们认为顺序元素位于同一行,但这纯粹是约定,而不是语言固有的。不同的库遵循不同的约定

测试代码

// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdint.h>
#include <stdlib.h>
#include <utility>
// File-scope scratch state shared by start_count() and stop_count().
// start/stop hold the full 64-bit timestamps assembled in stop_count().
uint64_t start, stop;
// Upper/lower 32-bit halves written by start_count().
unsigned cycles_high;
unsigned cycles_low;
// Upper/lower 32-bit halves written by stop_count().
unsigned cycles_high1;
unsigned cycles_low1;
// stop - start. Kept 64 bits wide so that an interval longer than 2^32
// cycles is not silently truncated when the difference is stored.
uint64_t ellapsed_cycles;

// Records the starting timestamp of a timed region.
// Executes CPUID followed by RDTSC, then copies EDX into cycles_high and EAX
// into cycles_low; stop_count() later combines these two halves into `start`.
// CPUID is presumably used as a serializing barrier, as in the Intel white
// paper linked above — confirm against that document.
// NOTE(review): the clobber list names rax/rbx/rcx/rdx but not "memory", so
// the compiler is presumably still free to move ordinary loads/stores across
// this statement — confirm whether that matters for the measurements.
static inline void start_count()
{
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
}

// Records the ending timestamp of a timed region and computes its length.
// Executes RDTSCP, copies EDX into cycles_high1 and EAX into cycles_low1, and
// only then executes CPUID (the reverse of the order used in start_count()).
// Afterwards the high/low halves captured by start_count() and by this
// function are combined into the 64-bit globals `start` and `stop`, and their
// difference is stored in `ellapsed_cycles`.
// NOTE(review): as in start_count(), there is no "memory" clobber — confirm
// whether the timed code can be moved past this statement by the compiler.
static inline void stop_count()
{
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
    // Reassemble the two 64-bit timestamps from their 32-bit halves.
    start = ( ((uint64_t)cycles_high << 32) | cycles_low );
    stop = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
    ellapsed_cycles = stop - start;
}


// matrix addition
#include <cstddef>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>

using std::size_t;

/// Dense n x m matrix whose elements live in one contiguous heap buffer in
/// row-major order: element (i, j) is stored at offset i*m + j.
///
/// The buffer is owned through std::unique_ptr, so copies are deep, moves
/// transfer the buffer, and no manual delete[] is needed. Assignment is
/// disabled, matching the original class (whose const members made it
/// non-assignable).
template<class T>
class Matrix
{
public:
    /// Allocates an n x m matrix. Elements are default-initialized, so for
    /// arithmetic T they hold indeterminate values until assigned.
    Matrix(const size_t n, const size_t m)
    : elems_(new T[n*m]), n_(n), m_(m)
    {}

    /// Builds an n x m matrix from nested vectors: elems[i][j] becomes
    /// element (i, j). Requires n, m != 0 and elems to provide at least n
    /// rows of at least m values each (checked with assert).
    Matrix(const size_t n, const size_t m, const std::vector< std::vector<T> >& elems)
    : elems_(new T[n*m]), n_(n), m_(m)
    {
        assert(n != 0 && m != 0);
        assert(elems.size() >= n);
        for (size_t i = 0; i != n_; ++i)
        {
            assert(elems[i].size() >= m);
            for (size_t j = 0; j != m_; ++j)
            {
                // The row stride is the column count m_, not the row count.
                elems_[i*m_ + j] = elems[i][j];
            }
        }
    }

    /// Deep copy: the new matrix gets its own buffer.
    Matrix(const Matrix& other)
    : elems_(new T[other.n_*other.m_]), n_(other.n_), m_(other.m_)
    {
        const size_t count = n_*m_;
        for (size_t k = 0; k != count; ++k)
        {
            elems_[k] = other.elems_[k];
        }
    }

    /// Move: takes over other's buffer and leaves other as an empty 0 x 0
    /// matrix, so its bounds checks stay consistent with its (null) storage.
    Matrix(Matrix&& other) noexcept
    : elems_(other.elems_.release()), n_(other.n_), m_(other.m_)
    {
        other.n_ = 0;
        other.m_ = 0;
    }

    Matrix& operator=(const Matrix&) = delete;
    Matrix& operator=(Matrix&&) = delete;

    /// Mutable access to element (i, j); bounds are checked with assert.
    T& operator()(const size_t i, size_t j)
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    /// Read-only access to element (i, j); bounds are checked with assert.
    const T& operator()(const size_t i, size_t j) const
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    /// Prints the matrix as "[ row0\n  row1\n  ...  ]", each element in
    /// fixed notation, width 6, precision 2, followed by one space.
    /// The fixed/precision settings are left set on the stream.
    friend std::ostream& operator<<(std::ostream& os, const Matrix& mat)
    {
        os << "[ ";
        os << std::fixed;
        os.precision(2);
        for (size_t i = 0; i != mat.n_; ++i)
        {
            // Rows after the first are indented to line up under "[ ".
            if (i != 0)
                os << "  ";
            for (size_t j = 0; j != mat.m_; ++j)
            {
                os.width(6); // width resets after every insertion
                os << mat(i, j) << " ";
            }
            if (i + 1 != mat.n_)
                os << "\n";
        }
        os << " ]";
        return os;
    }

    /// Number of rows.
    size_t n() const { return n_; }
    /// Number of columns.
    size_t m() const { return m_; }

private:
    std::unique_ptr<T[]> elems_; // n_*m_ elements, row-major
    size_t n_;                   // rows
    size_t m_;                   // columns
};


// Element-wise sum of two equally sized matrices.
// Walks the operands row by row (row index in the outer loop, column index
// in the inner loop), i.e. in the order the elements are laid out by
// Matrix::operator(), which indexes as i*m + j.
template<class T>
Matrix<T> add_sequential(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);
    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < cols; ++c)
            result(r, c) = mat1(r, c) + mat2(r, c);
    return result;
}

// Element-wise sum of two equally sized matrices.
// Walks the operands column by column (column index in the outer loop, row
// index in the inner loop), so consecutive accesses are one full row apart
// in the i*m + j layout used by Matrix::operator().
template<class T>
Matrix<T> add_strided(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);
    for (size_t c = 0; c < cols; ++c)
        for (size_t r = 0; r < rows; ++r)
            result(r, c) = mat1(r, c) + mat2(r, c);
    return result;
}


// misc: making random matrices, flushing cache, running timing tests
// Returns an n x m matrix filled with values drawn uniformly from
// [-100, 100). The engine and distribution are function-local statics, so
// successive calls (per element type T) continue one default-seeded sequence.
// Cells are filled column by column; that order decides which draw lands in
// which cell.
template<class T>
Matrix<T> rand_real_mat(const size_t n, const size_t m)
{
    static std::default_random_engine engine;
    static std::uniform_real_distribution<T> dist(-100.0, 100.0);
    Matrix<T> result(n, m);
    for (size_t col = 0; col < m; ++col)
        for (size_t row = 0; row < n; ++row)
            result(row, col) = dist(engine);
    return result;
}


#include <fstream>
// Pulls 30 KiB of unrelated data through the CPU between timed runs by
// reading random bytes and writing them to /dev/null.
//
// The bytes are moved with unformatted read()/write(). Formatted extraction
// (`>> int`) on raw random bytes fails at the first non-digit byte and sets
// failbit, after which every later extraction is a no-op, so almost nothing
// would actually be read. /dev/urandom is used because /dev/random can block
// on some systems once this many bytes are really requested on every call.
// NOTE(review): whether 30 KiB is enough to evict the benchmarked matrices
// depends on the cache sizes of the target machine — confirm.
void flush_cache()
{
    char buffer[30 * 1024];
    std::ifstream random_src("/dev/urandom", std::ifstream::binary);
    std::ofstream devnull("/dev/null", std::ofstream::binary);
    random_src.read(buffer, static_cast<std::streamsize>(sizeof buffer));
    // Write back only what was read; gcount() is 0 if the source is missing.
    devnull.write(buffer, random_src.gcount());
}

// Times `fnc` on freshly generated random 100 x 100 matrices.
//
// fnc:     binary matrix function to measure (e.g. add_strided<double>).
// n_times: number of timed trials to run.
// Returns one elapsed-cycle reading (ellapsed_cycles after stop_count())
// per trial, in trial order.
//
// Each trial builds two new operands, calls flush_cache(), and then brackets
// only the call to `fnc` with start_count()/stop_count().
template<class R, class ElemType>
static inline std::vector<uint64_t> time_mat_fnc(
    R (fnc)(const Matrix<ElemType>&, const Matrix<ElemType>&),
    const size_t n_times)
{
    std::ofstream devnull("/dev/null", std::ofstream::binary);

    std::vector<uint64_t> times;
    times.reserve(n_times);
    static const size_t n = 100;
    static const size_t m = 100;
    // Run exactly the requested number of trials, not a fixed 1000.
    for (size_t trial = 0; trial != n_times; ++trial)
    {
        // create 2 random n x m matrices
        const auto m1 = rand_real_mat<ElemType>(n, m);
        const auto m2 = rand_real_mat<ElemType>(n, m);

        flush_cache();

        // addition
        start_count();
        const auto sum = fnc(m1, m2);
        stop_count();
        times.push_back(ellapsed_cycles);

        // prevent optimizing away unused result
        devnull << sum;
    }
    return times;
}

// Arithmetic mean of the elements of a container.
//
// cntnr: any container with value_type and begin()/end().
// Returns sum / count using value_type's own division (so integer element
// types give a truncated mean). An empty container yields a
// value-initialized result (0 for arithmetic types) rather than dividing by
// zero.
template<typename C>
decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>())
average(const C& cntnr)
{
    typedef decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>()) Result;

    typename C::value_type sum = 0;
    size_t count = 0;
    for (const auto& value : cntnr)
    {
        sum += value;
        ++count;
    }
    if (count == 0)
        return Result();
    return sum / count;
}

// Benchmark driver: times the strided and the sequential addition on
// double-valued matrices, then prints the mean cycle count of each.
int main()
{
    using Elem = double;
    const size_t trials = 10000;

    // Strided runs first, then sequential.
    const auto strided_samples = time_mat_fnc(add_strided<Elem>, trials);
    const auto sequential_samples = time_mat_fnc(add_sequential<Elem>, trials);

    const auto strided_mean = average(strided_samples);
    const auto sequential_mean = average(sequential_samples);

    std::cout << "avg. cycles 100x100 doubles matrix addition" << std::endl;
    std::cout << "strided:    " << strided_mean << std::endl;
    std::cout << "sequential: " << sequential_mean << std::endl;
}
//定时
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
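#include <stdlib.h>
uint64_t start, stop;
unsigned cycles_high;
unsigned cycles_low;
unsigned cycles_high1;
unsigned cycles_low1;
unsigned ellapsed_cycles;
static inline void start_count()
{
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
}
static inline void stop_count()
{
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");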

    start = ( ((uint64_t)cycles_high …(代码片段在此处被截断)

我认为这无关紧要:在这两种情况下我们都是随机访问,而且仍然要遍历全部 n^2 个元素。我不确定自己是否理解其中的区别。

@FatihBAKIR 这不是随机访问,而是顺序访问与跨步访问的区别。

你的问题是什么?