C++ 在C++中以两种不同的方式将两个矩阵相加_C++

C++ 在C++中以两种不同的方式将两个矩阵相加

c++

C++ 在C++中以两种不同的方式将两个矩阵相加,c++,C++,我只是想知道这两种情况之间是否存在差异：第一段代码中，外层 for 循环遍历行，内层 for 循环遍历列；第二段代码中，外层 for 循环遍历列，内层 for 循环遍历行。我用了这两种方法，得到了相同的结果。您会得到相同的结果，但性能可能不同。矩阵很可能以行主序存储，按行访问可能获得更好的内存带宽和缓存利用率。试着对很大的矩阵做同样的操作，然后测量实际耗时（wall time）。以下是计时方法。首先是我的结果，报告的数字是10000次试验的平均CPU时钟周期数 $ clang++ -Ofast -DNDEBUG -std=c++1

我只是想知道这两种情况之间是否存在差异

第一段代码中，外层 for 循环遍历行，内层 for 循环遍历列

第二段代码中，外层 for 循环遍历列，内层 for 循环遍历行

我应用这两种方法并得到相同的结果

您将得到相同的结果，但可能会得到不同的性能

矩阵很可能以行主序（row-major）存储；按行访问可能获得更好的内存带宽和缓存利用率。试着对很大的矩阵做同样的操作，然后测量实际耗时（wall time）

以下是计时方法。首先，我的结果。报告的数字是10000次试验的平均CPU时钟周期

$ clang++ -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    60149
sequential: 27137
$ g++-4.9 -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    90517
sequential: 33407

顺序访问速度更快。原因在于缓存行为，特别是缓存行（cache line）。这是一篇关于这个主题的有趣文章

因为行和列的划分是任意的，所以我区分的是“跨步访问”和“顺序访问”，而不是“行”和“列”。通常在C++中，我们认为内存中连续的元素位于同一行，但这纯粹是约定，而不是语言固有的规定。不同的库遵循不同的约定

测试代码

// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdint.h>
#include <stdlib.h>
// Shared state for the cycle-count helpers start_count() / stop_count().
//
// start / stop hold the two 64-bit timestamps assembled by stop_count().
uint64_t start, stop;
// Upper and lower 32 bits of the first timestamp, written by start_count().
unsigned cycles_high;
unsigned cycles_low;
// Upper and lower 32 bits of the second timestamp, written by stop_count().
unsigned cycles_high1;
unsigned cycles_low1;
// Result of the last measurement (stop - start). Kept 64 bits wide so the
// difference of two 64-bit timestamps is never truncated.
uint64_t ellapsed_cycles;

// Records the starting timestamp of a cycle-count measurement.
//
// Runs CPUID followed by RDTSC and then copies EDX into the global
// cycles_high and EAX into the global cycles_low (the two halves that
// stop_count() later combines into the 64-bit `start` value).
//
// The sequence follows the Intel benchmarking white paper referenced at the
// top of this section, where CPUID is used as a barrier in front of RDTSC.
// The code is GNU-style extended inline assembly; rax, rbx, rcx and rdx are
// declared as clobbered.
static inline void start_count()
{
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
}

// Records the ending timestamp and computes the elapsed cycle count.
//
// Runs RDTSCP, copies EDX into the global cycles_high1 and EAX into the
// global cycles_low1, and then runs CPUID (the ordering used by the Intel
// benchmarking white paper referenced at the top of this section).
//
// Afterwards the globals `start` and `stop` are rebuilt from their high/low
// halves and their difference is stored in the global ellapsed_cycles
// (converted to that variable's declared type). start_count() must have been
// called first so that cycles_high / cycles_low hold the starting timestamp.
static inline void stop_count()
{
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
    // Combine the 32-bit halves into full 64-bit timestamps.
    start = ( ((uint64_t)cycles_high << 32) | cycles_low );
    stop = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
    // Publish the measurement for the caller to read.
    ellapsed_cycles = stop - start;
}


// matrix addition
#include <cstddef>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>

using std::size_t;

// Dense n x m matrix stored contiguously in row-major order: element (i, j)
// lives at offset i*m + j of a single heap buffer.
//
// The dimensions are fixed at construction. The buffer is owned through
// std::unique_ptr, copy construction performs a deep copy and move
// construction transfers the buffer. Assignment is not supported because the
// dimensions are const. A moved-from Matrix may only be destroyed.
template<class T>
class Matrix
{
public:
    // Allocates an n x m matrix. Elements are default-initialized, so for
    // built-in arithmetic types they hold indeterminate values until written.
    Matrix(const size_t n, const size_t m)
    : elems_(new T[n*m]), n_(n), m_(m)
    {}

    // Allocates an n x m matrix and copies elems[i][j] into element (i, j).
    // `elems` must provide at least n rows of at least m values each, and
    // both dimensions must be non-zero (checked with assert).
    Matrix(const size_t n, const size_t m, const std::vector< std::vector<T> >& elems)
    : elems_(new T[n*m]), n_(n), m_(m)
    {
        assert(n != 0 && m != 0);
        assert(elems.size() >= n);
        for (size_t i = 0; i != n_; ++i)
        {
            assert(elems[i].size() >= m);
            for (size_t j = 0; j != m_; ++j)
            {
                // Row stride is the number of columns (m_), not rows.
                elems_[i*m_ + j] = elems[i][j];
            }
        }
    }

    // Deep copy: the new matrix gets its own buffer.
    Matrix(const Matrix& other)
    : elems_(new T[other.n_*other.m_]), n_(other.n_), m_(other.m_)
    {
        const size_t count = n_ * m_;
        for (size_t k = 0; k != count; ++k)
        {
            elems_[k] = other.elems_[k];
        }
    }

    // Takes over other's buffer; `other` is left without storage.
    Matrix(Matrix&& other) noexcept
    : elems_(other.elems_.release()), n_(other.n_), m_(other.m_)
    {}

    // Dimensions are immutable, so assignment cannot be supported.
    Matrix& operator=(const Matrix&) = delete;
    Matrix& operator=(Matrix&&) = delete;

    // Mutable access to element (i, j); indices are bounds-checked by assert.
    T& operator()(const size_t i, size_t j)
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    // Read-only access to element (i, j); indices are bounds-checked by assert.
    const T& operator()(const size_t i, size_t j) const
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    // Writes the matrix as "[ row0\n  row1\n ... ]". Each element is printed
    // in fixed notation with width 6 and precision 2, followed by a space.
    // The fixed/precision settings are left on the stream afterwards.
    // An empty matrix prints as "[  ]".
    friend std::ostream& operator<<(std::ostream& os, const Matrix& mat)
    {
        os << "[ ";
        for (size_t i = 0; i != mat.n_; ++i)
        {
            // Rows after the first are indented to line up under "[ ".
            if (i != 0)
                os << "  ";
            for (size_t j = 0; j != mat.m_; ++j)
            {
                os << std::fixed;
                os.width(6);
                os.precision(2);
                os << mat(i, j) << " ";
            }
            if (i + 1 != mat.n_)
                os << "\n";
        }
        os << " ]";
        return os;
    }

    // Number of rows.
    size_t n() const { return n_; }
    // Number of columns.
    size_t m() const { return m_; }

private:
    std::unique_ptr<T[]> elems_;  // n_*m_ elements, row-major
    const size_t n_;              // rows
    const size_t m_;              // columns
};


// Element-wise sum of two equally sized matrices, walking the elements row
// by row (outer loop over rows, inner loop over columns). With Matrix's
// i*m + j indexing this touches consecutive memory locations.
// Both operands must have identical dimensions (checked with assert).
template<class T>
Matrix<T> add_sequential(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());

    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);

    for (size_t row = 0; row < rows; ++row)
        for (size_t col = 0; col < cols; ++col)
            result(row, col) = mat1(row, col) + mat2(row, col);

    return result;
}

// Element-wise sum of two equally sized matrices, walking the elements
// column by column (outer loop over columns, inner loop over rows). With
// Matrix's i*m + j indexing consecutive accesses are m elements apart.
// Both operands must have identical dimensions (checked with assert).
template<class T>
Matrix<T> add_strided(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());

    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);

    for (size_t col = 0; col < cols; ++col)
        for (size_t row = 0; row < rows; ++row)
            result(row, col) = mat1(row, col) + mat2(row, col);

    return result;
}


// misc: making random matrices, flushing cache, running timing tests
// Builds an n x m matrix whose elements are drawn from a uniform real
// distribution constructed with bounds (-100.0, 100.0).
//
// The engine and distribution are function-local statics, so successive
// calls (per element type T) continue the same default-seeded sequence.
// Elements are assigned column by column.
template<class T>
Matrix<T> rand_real_mat(const size_t n, const size_t m)
{
    static std::default_random_engine gen;
    static std::uniform_real_distribution<T> dis(-100.0, 100.0);

    Matrix<T> result(n, m);
    for (size_t col = 0; col < m; ++col)
        for (size_t row = 0; row < n; ++row)
            result(row, col) = dis(gen);

    return result;
}


#include <fstream>
// Best-effort attempt to disturb the CPU caches between timed runs by
// streaming 30 KiB of random bytes from /dev/urandom into /dev/null.
//
// The bytes are moved with unformatted read()/write(): formatted `>>`
// extraction of an int from a binary device fails on the first
// non-numeric byte and leaves the stream unusable for further reads.
// /dev/urandom is used so the call cannot block waiting for entropy.
//
// NOTE(review): this only pulls 30 KiB through memory; it does not guarantee
// that the benchmark's matrices are evicted from any cache level.
void flush_cache()
{
    static const size_t kBytes = 30 * 1024;
    char buffer[kBytes];

    std::ifstream random_src("/dev/urandom", std::ifstream::binary);
    std::ofstream devnull("/dev/null", std::ofstream::binary);

    // If either device cannot be opened the calls below are harmless no-ops.
    random_src.read(buffer, static_cast<std::streamsize>(kBytes));
    devnull.write(buffer, random_src.gcount());
}

// Times `fnc` on freshly generated random 100 x 100 matrices.
//
// fnc      binary matrix function to measure (e.g. add_strided).
// n_times  number of trials to run.
//
// Returns one elapsed-cycle sample per trial, in trial order. Each trial
// generates two random operands, calls flush_cache(), and measures only the
// call to `fnc` between start_count() and stop_count().
template<class R, class ElemType>
static inline std::vector<uint64_t> time_mat_fnc(
    R (fnc)(const Matrix<ElemType>&, const Matrix<ElemType>&),
    const size_t n_times)
{
    std::ofstream devnull("/dev/null", std::ofstream::binary);

    std::vector<uint64_t> times;
    times.reserve(n_times);
    static const size_t n = 100;
    static const size_t m = 100;
    // Run exactly the requested number of trials.
    for (size_t trial = 0; trial != n_times; ++trial)
    {
        // create 2 random n x m matrices
        const auto m1 = rand_real_mat<ElemType>(n, m);
        const auto m2 = rand_real_mat<ElemType>(n, m);

        flush_cache();

        // addition
        start_count();
        const auto sum = fnc(m1, m2);
        stop_count();
        times.push_back(ellapsed_cycles);

        // prevent optimizing away unused result
        devnull << sum;
    }
    return times;
}

// Arithmetic mean of the elements of a container.
//
// The sum is accumulated in C::value_type and divided by the element count
// converted to that same type, so signed sums are not reinterpreted as
// unsigned and integer containers use integer division.
// Returns a value-initialized result (zero) for an empty container.
template<typename C>
decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>())
average(const C& cntnr)
{
    typedef typename C::value_type Value;
    typedef decltype(std::declval<Value>() / std::declval<Value>()) Result;

    Value sum = 0;
    size_t size = 0;
    for (const auto& item : cntnr)
    {
        sum += item;
        ++size;
    }

    // Avoid dividing by zero when there is nothing to average.
    if (size == 0)
        return Result();

    return sum / static_cast<Value>(size);
}

// Benchmark driver: times add_strided and then add_sequential on double
// matrices via time_mat_fnc, and prints the average cycle count of each.
int main()
{
    using Elem = double;
    const size_t trial_count = 10000;

    // Strided is measured first, then sequential.
    const auto strided_samples = time_mat_fnc(add_strided<Elem>, trial_count);
    const auto sequential_samples = time_mat_fnc(add_sequential<Elem>, trial_count);

    std::cout << "avg. cycles 100x100 doubles matrix addition" << std::endl;
    std::cout << "strided:    " << average(strided_samples) << std::endl;
    std::cout << "sequential: " << average(sequential_samples) << std::endl;
    return 0;
}

// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdlib.h>
uint64_t start, stop;
unsigned cycles_high;
unsigned cycles_low;
unsigned cycles_high1;
unsigned cycles_low1;
unsigned ellapsed_cycles;

static inline void start_count()
{
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
}

static inline void stop_count()
{
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        : "%rax", "%rbx", "%rcx", "%rdx");
    start = ( ((uint64_t)cycles_high << 32) | cycles_low );

（此处代码在原文中被截断，完整版本见上文“测试代码”一节。）

评论：

我认为这无关紧要，在这两种情况下，我们都是随机访问，而且我们仍然要遍历全部 n^2 个元素。我不确定我是否理解其中的区别。

@FatihBAKIR 这不是随机访问的问题，而是顺序访问与跨步访问的区别。你的问题是什么？