C++ 软件光栅化实现加速思想_C++_Optimization_Rendering_Rasterizing

C++ 软件光栅化实现加速思想
c++ optimization
C++ 软件光栅化实现加速思想,c++,optimization,rendering,rasterizing,C++,Optimization,Rendering,Rasterizing,我正在编写三角形光栅化软件。在下面的示例中，我正在光栅化一个全屏三角形，但它仍然非常慢……在我的i7上，它以1024x768的速度为一个多边形以130 FPS的速度运行我可以看到一些优化的空间，但仍然没有接近我想要的性能。我遗漏了什么，有什么想法可以快速加速吗。。。在奔腾2上使用更多多边形进行软件渲染时，即使是Quake2也运行得更快：/ 谢谢你的建议下面是代码（我使用gmtl表示向量）浮动交叉（Vec2f常量和边1，Vec2f常量和边2） { 返回edge1[0]*edge2[1]-e
我正在编写三角形光栅化软件。在下面的示例中，我正在光栅化一个全屏三角形，但它仍然非常慢……在我的i7上，它以1024x768的速度为一个多边形以130 FPS的速度运行
我可以看到一些优化的空间，但仍然没有接近我想要的性能。我遗漏了什么，有什么想法可以快速加速吗。。。在奔腾2上使用更多多边形进行软件渲染时，即使是Quake2也运行得更快：/
谢谢你的建议
下面是代码（我使用gmtl表示向量）
浮动交叉（Vec2f常量和边1，Vec2f常量和边2）
{
返回edge1[0]*edge2[1]-edge1[1]*edge2[0]；
}
bool caculatebary中心权重（float invdoubletria、float doubletria、Vec2f const和edgesFromTo01_12_20）[3]、Vec2f const和顶点[3]、Vec2f const和point、float和overset[3]）
{
超过[0]=交叉（点-顶点[2]，边从01_12_20[1]）；
超过[1]=交叉（点-顶点[0]，边从01_12_20[2]）；
超过[2]=交叉（点-顶点[1]，边从01_12_20[0]）；
如果（大于[0]>=0.0f和大于[1]>=0.0f和大于[2]>=0.0f
&&超过[0]+超过[1]+超过[2]=一秒钟）
{
我相信我一定能以每秒130帧的速度跑完全程
…就连地震2也跑得更快了
我必须承认，我的灵感来自OpenGL——类似于OpenGL的一些部分（至少，我相信它是如何工作的），并且经过了一些简化
我还必须承认，我不明白caculateBarycentricWeights（）
函数到底有什么好处
但是，我的实施基于以下概念：
光栅化器必须在屏幕空间中工作
在光栅化之前，必须将3D顶点转换为屏幕空间
内部光栅循环必须尽可能简单–大多数内部循环中的分支（即if
和复杂函数）肯定会适得其反
因此，我创建了一个新的rasterize（）
函数。因此，我认为rasterize（）
必须填充水平屏幕线。为此，我按照3个三角形顶点的y
分量对其排序。（注意，顶点必须事先转换为屏幕空间。）在常规情况下，这允许将三角形水平切割为两部分：

位于水平基座上方的顶部
在水平底部下方具有峰值的下部

源代码如下所示，但草图可能有助于使所有插值更易于理解：

为了事先将VTC
转换为屏幕空间，我使用了两个矩阵：
const Mat4x4f matProj(InitScale, 1.0f / Width, 1.0f / Height, 1.0f);
const Mat4x4f matScreen
  = Mat4x4f(InitScale, 0.5f * Width, 0.5f * Height, 1.0f)
  * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
const Mat4x4f mat = matScreen * matProj /* * matView * matModel */;

在哪里

matProj
是缩放模型坐标以剪裁空间的投影矩阵
matScreen
负责将剪辑空间转换为屏幕空间

剪辑空间是可见部分在x、y和z方向的范围[-1,1]内的空间
屏幕空间有[0，宽度]和[0，高度）两个范围，其中我根据OPs要求使用了width=1024
和height=768

当我创建OpenGL程序时，我总是从一个蓝色的屏幕开始（因为蓝色是我最喜欢的透明颜色），这是（某种）令人讨厌的传统。这主要是由于混淆了一些矩阵运算，然后努力查找和修复这些运算。因此，我尝试将光栅化结果可视化，以确保我的基准测试不会过于热情，因为三角形的任何部分都在视图之外，因此会被剪掉：

为了实现这一点，我将光栅化器示例包装到一个简单的Qt应用程序中，其中FBO
数据存储在QImage
中，然后将其应用到QPixmap
中，并用QLabel
显示。一旦我得到了这个，我想最好添加一些简单的动画随着时间的推移，通过修改模型视图矩阵，我得到了以下示例：
#include <cstdint>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <vector>

#include "linmath.h"

typedef unsigned uint;
typedef std::uint32_t uint32;

const int Width = 1024, Height = 768;

class FBO {
  public:
    const int width, height;
  private:
    std::vector<uint32> _rgba; // pixel buffer
  public:
    FBO(int width, int height):
      width(width), height(height),
      _rgba(width * height, 0)
    { }
    ~FBO() = default;
    FBO(const FBO&) = delete;
    FBO& operator=(const FBO&) = delete;

    void clear(uint32 rgba) { std::fill(_rgba.begin(), _rgba.end(), rgba); }
    void set(int x, int y, uint32 rgba)
    {
      const size_t i = y * width + x;
      _rgba[i] = rgba;
    }
    size_t getI(int y) const { return y * width; }
    size_t getI(int x, int y) const { return y * width + x; }
    void set(size_t i, uint32 rgba) { _rgba[i] = rgba; }

    const std::vector<uint32>& getRGBA() const { return _rgba; }
};

void rasterize(FBO &fbo, const Vec3f vtcs[3], const uint32 rgba)
{
  // sort vertices by y coordinates
  uint iVtcs[3] = { 0, 1, 2 };
  if (vtcs[iVtcs[0]].y > vtcs[iVtcs[1]].y) std::swap(iVtcs[0], iVtcs[1]);
  if (vtcs[iVtcs[1]].y > vtcs[iVtcs[2]].y) std::swap(iVtcs[1], iVtcs[2]);
  if (vtcs[iVtcs[0]].y > vtcs[iVtcs[1]].y) std::swap(iVtcs[0], iVtcs[1]);
  const Vec3f vtcsS[3] = { vtcs[iVtcs[0]], vtcs[iVtcs[1]], vtcs[iVtcs[2]] };
  const float yM = vtcs[1].y;
  // cut triangle in upper and lower part
  const float xT = vtcsS[0].x;
  float xML = vtcsS[1].x;
  const float f = (yM - vtcsS[0].y) / (vtcsS[2].y - vtcsS[0].y);
  float xMR = (1.0f - f) * xT + f * vtcsS[2].x;
  if (xML > xMR) std::swap(xML, xMR);
  // draw upper part of triangle
  if (vtcs[iVtcs[0]].y < yM) {
    const float dY = yM - vtcsS[0].y;
    for (int y = std::max((int)vtcsS[0].y, 0),
      yE = std::min((int)(yM + 0.5f), fbo.height);
      y < yE; ++y) {
      const float f1 = (yM - y) / dY, f0 = 1.0f - f1;
      const float xL = f0 * xT + f1 * xML, xR = f0 * xT + f1 * xMR;
      const size_t i = fbo.getI(y);
      for (int x = std::max((int)xL, 0),
        xE = std::min((int)(xR + 0.5f), fbo.width);
        x < xE; ++x) {
        fbo.set(i + x, rgba);
      }
    }
  } // else upper edge horizontal
  // draw lower part of triangle
  if (yM < vtcs[2].y) {
    const float xB = vtcsS[2].x;
    const float dY = vtcsS[2].y - yM;
    for (int y = std::max((int)yM, 0),
      yE = std::min((int)(vtcsS[2].y + 0.5f), fbo.height);
      y < yE; ++y) {
      const float f1 = (y - yM) / dY, f0 = 1.0f - f1;
      const float xL = f0 * xML + f1 * xB, xR = f0 * xMR + f1 * xB;
      const size_t i = fbo.getI(y);
      for (int x = std::max((int)xL, 0),
        xE = std::min((int)(xR + 0.5f), fbo.width);
        x < xE; ++x) {
        fbo.set(i + x, rgba);
      }
    }
  } // else lower edge horizontal
}

template <typename VALUE>
Vec3T<VALUE> transformPoint(const Mat4x4T<VALUE> &mat, const Vec3T<VALUE> &pt)
{
  Vec4T<VALUE> pt_ = mat * Vec4T<VALUE>(pt.x, pt.y, pt.z, (VALUE)1);
  return pt_.w != (VALUE)0
    ? Vec3T<VALUE>(pt_.x / pt_.w, pt_.y / pt_.w, pt_.z / pt_.w)
    : Vec3T<VALUE>(pt_.x, pt_.y, pt_.z);
}

typedef std::chrono::high_resolution_clock HiResClock;
typedef std::chrono::microseconds MicroSecs;

int mainBench()
{
  const Mat4x4f matProj(InitScale, 1.0f / Width, 1.0f / Height, 1.0f);
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * Width, 0.5f * Height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * matProj /* * matView * matModel */;
  FBO fbo(Width, Height);

  HiResClock::time_point start = HiResClock::now();
  size_t fps = 0;

  for (;;) {
    static Vec3f v0 = { -2000.0f, -2000.0, 10.0 };
    static Vec3f v1 = { 2000.0f, -2000.0, 10.0 };
    static Vec3f v2 = { 0.0f, 2000.0, 10.0 };
    const Vec3f vtcs[] = {
      transformPoint(mat, v0), transformPoint(mat, v1), transformPoint(mat, v2)
    };
    rasterize(fbo, vtcs, 0xff0000ff);
    ++fps;

    HiResClock::time_point now = HiResClock::now();
    auto timeDiff
      = std::chrono::duration_cast<MicroSecs>(now - start).count();

    static const long oneSecond = 1000000;
    if (timeDiff >= oneSecond) {
      std::cout << fps << " " << std::flush;
      fps = 0;
      start = now;
    }
  }
}

#include <stack>

#include <QtWidgets>

struct RenderContext {
  FBO fbo; // frame buffer object
  Mat4x4f matModelView;
  Mat4x4f matProj;

  RenderContext(int width, int height):
    fbo(width, height),
    matModelView(InitIdent),
    matProj(InitIdent)
  { }
  ~RenderContext() = default;
};

void render(RenderContext &context)
{
  context.fbo.clear(0xffffcc88);
  static Vec3f v0 = { -2000.0f, -2000.0, 10.0 };
  static Vec3f v1 = { 2000.0f, -2000.0, 10.0 };
  static Vec3f v2 = { 0.0f, 2000.0, 10.0 };
#if 0 // test transformations of mainBench()
  const Mat4x4f matProj(InitScale, 1.0f / Width, 1.0f / Height, 1.0f);
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * Width, 0.5f * Height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * matProj /* * matView * matModel */;
#else // make a simple animation
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * context.fbo.width, 0.5f * context.fbo.height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * context.matProj * context.matModelView;
#endif // 0
  const Vec3f vtcs[] = {
    transformPoint(mat, v0), transformPoint(mat, v1), transformPoint(mat, v2)
  };
  rasterize(context.fbo, vtcs, 0xff0000ff);
}

int main(int argc, char **argv)
{
  // evaluate arguments
  const bool gui = argc > 1
    && (strcmp(argv[1], "gui") == 0 || strcmp(argv[1], "-gui") == 0);
  // without arguments: start benchmark
  if (!gui) return mainBench();
  // remove argument "gui"
  for (char **arg = argv + 1; *arg; ++arg) arg [0] = arg[1];
  // Qt Demo
  qDebug() << "Qt Version:" << QT_VERSION_STR;
  QApplication app(argc, argv);
  RenderContext context(Width, Height);
  // setup GUI
  QPixmap qPixmapImg(Width, Height);
  QLabel qLblImg;
  qLblImg.setWindowTitle("Software Rasterizer Demo");
  qLblImg.setPixmap(qPixmapImg);
  qLblImg.show();
  // Qt timer for periodic rendering/animation
  QTime qTime(0, 0);
  QTimer qTimer;
  qTimer.setInterval(50);
  // install signal handlers
  QObject::connect(&qTimer, &QTimer::timeout,
    [&]() {
      // simple animation
      const float r = 1.0f, t = 0.001f * qTime.elapsed();
      context.matModelView
        = Mat4x4f(InitTrans, Vec3f(r * sin(t), r * cos(t), 0.0f))
        * Mat4x4f(InitScale, 0.0001f, 0.0001f, 0.0001f);
      // rendering
      render(context);
      // transfer FBO to qLblImg
      const QImage qImg((uchar*)context.fbo.getRGBA().data(),
        Width, Height, QImage::Format_RGBA8888);
      qPixmapImg.convertFromImage(qImg);
      qLblImg.setPixmap(qPixmapImg);
    });
  // runtime loop
  qTime.start(); qTimer.start();
  return app.exec();
}

在添加Qt代码之前，我只使用以下代码编译：
$g++-std=c++11-o rasterizerSimple rasterizerSimple.cc

为了使用Qt进行编译，我编写了一个Qt项目rasterizerSimple.pro
：
SOURCES=rasterizerSimple.cc
QT+=widgets

在Windows 10上编译和测试：
$qmake-qt5光栅化器simple.pro
$make
美元/光栅化简单
2724 2921 3015 2781 2800 2805 2859 2783 2931 2871 2902 2983 2995 2882 2940 2878 3007 2993 3066 3067 3118 3144 2849 3084 3020 3005 3030 2991 2999 3065 2941 3123 3119 2905 3135 2938

CtrlC
我的笔记本电脑有一个Intel i7（当我让程序运行更长的时间时，会产生更清凉的噪音）
这看起来没那么糟糕——平均每秒3000个三角形（如果他的i7速度比我的慢不了多少的话，这比OP的130 FPS还要快）
可能是，FPS可以通过进一步的改进得到进一步的提升。例如，插值可以通过使用来完成，尽管它不出现在大多数内部循环中。要优化大多数内部循环，水平线的填充应该以更快的方式完成（尽管我不太确定如何实现）
要启动Qt可视化，必须使用gui
启动应用程序：
$。/rasterizerSimple gui



这个示例启发了我在github上找到的一个玩具项目。编译代码时是否启用了优化？是的，在版本中，设置为最大速度（/O2）关于工作代码的反馈请求应在处提出。您是否可以从光栅化中删除所有计算，并仅使用常量值填充缓冲区？这将建立性能基线（您肯定不会比这更快）。如果您可以查看程序集，这将有所帮助。您是否有可能
#include <cstdint>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <vector>

#include "linmath.h"

typedef unsigned uint;
typedef std::uint32_t uint32;

const int Width = 1024, Height = 768;

class FBO {
  public:
    const int width, height;
  private:
    std::vector<uint32> _rgba; // pixel buffer
  public:
    FBO(int width, int height):
      width(width), height(height),
      _rgba(width * height, 0)
    { }
    ~FBO() = default;
    FBO(const FBO&) = delete;
    FBO& operator=(const FBO&) = delete;

    void clear(uint32 rgba) { std::fill(_rgba.begin(), _rgba.end(), rgba); }
    void set(int x, int y, uint32 rgba)
    {
      const size_t i = y * width + x;
      _rgba[i] = rgba;
    }
    size_t getI(int y) const { return y * width; }
    size_t getI(int x, int y) const { return y * width + x; }
    void set(size_t i, uint32 rgba) { _rgba[i] = rgba; }

    const std::vector<uint32>& getRGBA() const { return _rgba; }
};

void rasterize(FBO &fbo, const Vec3f vtcs[3], const uint32 rgba)
{
  // sort vertices by y coordinates
  uint iVtcs[3] = { 0, 1, 2 };
  if (vtcs[iVtcs[0]].y > vtcs[iVtcs[1]].y) std::swap(iVtcs[0], iVtcs[1]);
  if (vtcs[iVtcs[1]].y > vtcs[iVtcs[2]].y) std::swap(iVtcs[1], iVtcs[2]);
  if (vtcs[iVtcs[0]].y > vtcs[iVtcs[1]].y) std::swap(iVtcs[0], iVtcs[1]);
  const Vec3f vtcsS[3] = { vtcs[iVtcs[0]], vtcs[iVtcs[1]], vtcs[iVtcs[2]] };
  const float yM = vtcs[1].y;
  // cut triangle in upper and lower part
  const float xT = vtcsS[0].x;
  float xML = vtcsS[1].x;
  const float f = (yM - vtcsS[0].y) / (vtcsS[2].y - vtcsS[0].y);
  float xMR = (1.0f - f) * xT + f * vtcsS[2].x;
  if (xML > xMR) std::swap(xML, xMR);
  // draw upper part of triangle
  if (vtcs[iVtcs[0]].y < yM) {
    const float dY = yM - vtcsS[0].y;
    for (int y = std::max((int)vtcsS[0].y, 0),
      yE = std::min((int)(yM + 0.5f), fbo.height);
      y < yE; ++y) {
      const float f1 = (yM - y) / dY, f0 = 1.0f - f1;
      const float xL = f0 * xT + f1 * xML, xR = f0 * xT + f1 * xMR;
      const size_t i = fbo.getI(y);
      for (int x = std::max((int)xL, 0),
        xE = std::min((int)(xR + 0.5f), fbo.width);
        x < xE; ++x) {
        fbo.set(i + x, rgba);
      }
    }
  } // else upper edge horizontal
  // draw lower part of triangle
  if (yM < vtcs[2].y) {
    const float xB = vtcsS[2].x;
    const float dY = vtcsS[2].y - yM;
    for (int y = std::max((int)yM, 0),
      yE = std::min((int)(vtcsS[2].y + 0.5f), fbo.height);
      y < yE; ++y) {
      const float f1 = (y - yM) / dY, f0 = 1.0f - f1;
      const float xL = f0 * xML + f1 * xB, xR = f0 * xMR + f1 * xB;
      const size_t i = fbo.getI(y);
      for (int x = std::max((int)xL, 0),
        xE = std::min((int)(xR + 0.5f), fbo.width);
        x < xE; ++x) {
        fbo.set(i + x, rgba);
      }
    }
  } // else lower edge horizontal
}

template <typename VALUE>
Vec3T<VALUE> transformPoint(const Mat4x4T<VALUE> &mat, const Vec3T<VALUE> &pt)
{
  Vec4T<VALUE> pt_ = mat * Vec4T<VALUE>(pt.x, pt.y, pt.z, (VALUE)1);
  return pt_.w != (VALUE)0
    ? Vec3T<VALUE>(pt_.x / pt_.w, pt_.y / pt_.w, pt_.z / pt_.w)
    : Vec3T<VALUE>(pt_.x, pt_.y, pt_.z);
}

typedef std::chrono::high_resolution_clock HiResClock;
typedef std::chrono::microseconds MicroSecs;

int mainBench()
{
  const Mat4x4f matProj(InitScale, 1.0f / Width, 1.0f / Height, 1.0f);
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * Width, 0.5f * Height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * matProj /* * matView * matModel */;
  FBO fbo(Width, Height);

  HiResClock::time_point start = HiResClock::now();
  size_t fps = 0;

  for (;;) {
    static Vec3f v0 = { -2000.0f, -2000.0, 10.0 };
    static Vec3f v1 = { 2000.0f, -2000.0, 10.0 };
    static Vec3f v2 = { 0.0f, 2000.0, 10.0 };
    const Vec3f vtcs[] = {
      transformPoint(mat, v0), transformPoint(mat, v1), transformPoint(mat, v2)
    };
    rasterize(fbo, vtcs, 0xff0000ff);
    ++fps;

    HiResClock::time_point now = HiResClock::now();
    auto timeDiff
      = std::chrono::duration_cast<MicroSecs>(now - start).count();

    static const long oneSecond = 1000000;
    if (timeDiff >= oneSecond) {
      std::cout << fps << " " << std::flush;
      fps = 0;
      start = now;
    }
  }
}

#include <stack>

#include <QtWidgets>

struct RenderContext {
  FBO fbo; // frame buffer object
  Mat4x4f matModelView;
  Mat4x4f matProj;

  RenderContext(int width, int height):
    fbo(width, height),
    matModelView(InitIdent),
    matProj(InitIdent)
  { }
  ~RenderContext() = default;
};

void render(RenderContext &context)
{
  context.fbo.clear(0xffffcc88);
  static Vec3f v0 = { -2000.0f, -2000.0, 10.0 };
  static Vec3f v1 = { 2000.0f, -2000.0, 10.0 };
  static Vec3f v2 = { 0.0f, 2000.0, 10.0 };
#if 0 // test transformations of mainBench()
  const Mat4x4f matProj(InitScale, 1.0f / Width, 1.0f / Height, 1.0f);
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * Width, 0.5f * Height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * matProj /* * matView * matModel */;
#else // make a simple animation
  const Mat4x4f matScreen
    = Mat4x4f(InitScale, 0.5f * context.fbo.width, 0.5f * context.fbo.height, 1.0f)
    * Mat4x4f(InitTrans, Vec3f(1.0f, 1.0f, 0.0f));
  const Mat4x4f mat = matScreen * context.matProj * context.matModelView;
#endif // 0
  const Vec3f vtcs[] = {
    transformPoint(mat, v0), transformPoint(mat, v1), transformPoint(mat, v2)
  };
  rasterize(context.fbo, vtcs, 0xff0000ff);
}

int main(int argc, char **argv)
{
  // evaluate arguments
  const bool gui = argc > 1
    && (strcmp(argv[1], "gui") == 0 || strcmp(argv[1], "-gui") == 0);
  // without arguments: start benchmark
  if (!gui) return mainBench();
  // remove argument "gui"
  for (char **arg = argv + 1; *arg; ++arg) arg [0] = arg[1];
  // Qt Demo
  qDebug() << "Qt Version:" << QT_VERSION_STR;
  QApplication app(argc, argv);
  RenderContext context(Width, Height);
  // setup GUI
  QPixmap qPixmapImg(Width, Height);
  QLabel qLblImg;
  qLblImg.setWindowTitle("Software Rasterizer Demo");
  qLblImg.setPixmap(qPixmapImg);
  qLblImg.show();
  // Qt timer for periodic rendering/animation
  QTime qTime(0, 0);
  QTimer qTimer;
  qTimer.setInterval(50);
  // install signal handlers
  QObject::connect(&qTimer, &QTimer::timeout,
    [&]() {
      // simple animation
      const float r = 1.0f, t = 0.001f * qTime.elapsed();
      context.matModelView
        = Mat4x4f(InitTrans, Vec3f(r * sin(t), r * cos(t), 0.0f))
        * Mat4x4f(InitScale, 0.0001f, 0.0001f, 0.0001f);
      // rendering
      render(context);
      // transfer FBO to qLblImg
      const QImage qImg((uchar*)context.fbo.getRGBA().data(),
        Width, Height, QImage::Format_RGBA8888);
      qPixmapImg.convertFromImage(qImg);
      qLblImg.setPixmap(qPixmapImg);
    });
  // runtime loop
  qTime.start(); qTimer.start();
  return app.exec();
}

#ifndef LIN_MATH_H
#define LIN_MATH_H

#include <iostream>
#include <cassert>
#include <cmath>

double Pi = 3.1415926535897932384626433832795;

template <typename VALUE>
inline VALUE degToRad(VALUE angle)
{
  return (VALUE)Pi * angle / (VALUE)180;
}

template <typename VALUE>
inline VALUE radToDeg(VALUE angle)
{
  return (VALUE)180 * angle / (VALUE)Pi;
}

template <typename VALUE>
struct Vec2T {
  VALUE x, y;
  Vec2T() { }
  Vec2T(VALUE x, VALUE y): x(x), y(y) { }
};
template <typename VALUE>
VALUE length(const Vec2T<VALUE> &vec)
{
  return sqrt(vec.x * vec.x + vec.y * vec.y);
}
template <typename VALUE>
std::ostream& operator<<(std::ostream &out, const Vec2T<VALUE> &v)
{
  return out << "( " << v.x << ", " << v.y << " )";
}

typedef Vec2T<float> Vec2f;
typedef Vec2T<double> Vec2;

template <typename VALUE>
struct Vec3T {
  VALUE x, y, z;
  Vec3T() { }
  Vec3T(VALUE x, VALUE y, VALUE z): x(x), y(y), z(z) { }
  Vec3T(const Vec2T<VALUE> &xy, VALUE z): x(xy.x), y(xy.y), z(z) { }
  explicit operator Vec2T<VALUE>() const { return Vec2T<VALUE>(x, y); }
};
template <typename VALUE>
std::ostream& operator<<(std::ostream &out, const Vec3T<VALUE> &v)
{
  return out << "( " << v.x << ", " << v.y << ", " << v.z << " )";
}

typedef Vec3T<float> Vec3f;
typedef Vec3T<double> Vec3;

template <typename VALUE>
struct Vec4T {
  VALUE x, y, z, w;
  Vec4T() { }
  Vec4T(VALUE x, VALUE y, VALUE z, VALUE w): x(x), y(y), z(z), w(w) { }
  Vec4T(const Vec2T<VALUE> &xy, VALUE z, VALUE w):
    x(xy.x), y(xy.y), z(z), w(w)
  { }
  Vec4T(const Vec3T<VALUE> &xyz, VALUE w):
    x(xyz.x), y(xyz.y), z(xyz.z), w(w)
  { }
  explicit operator Vec2T<VALUE>() const { return Vec2T<VALUE>(x, y); }
  explicit operator Vec3T<VALUE>() const { return Vec3T<VALUE>(x, y, z); }
};
template <typename VALUE>
std::ostream& operator<<(std::ostream &out, const Vec4T<VALUE> &v)
{
  return out << "( " << v.x << ", " << v.y << ", " << v.z << ", " << v.w << " )";
}

typedef Vec4T<float> Vec4f;
typedef Vec4T<double> Vec4;

enum ArgInitIdent { InitIdent };
enum ArgInitTrans { InitTrans };
enum ArgInitRot { InitRot };
enum ArgInitRotX { InitRotX };
enum ArgInitRotY { InitRotY };
enum ArgInitRotZ { InitRotZ };
enum ArgInitScale { InitScale };

template <typename VALUE>
struct Mat4x4T {
  union {
    VALUE comp[4 * 4];
    struct {
      VALUE _00, _01, _02, _03;
      VALUE _10, _11, _12, _13;
      VALUE _20, _21, _22, _23;
      VALUE _30, _31, _32, _33;
    };
  };

  // constructor to build a matrix by elements
  Mat4x4T(
    VALUE _00, VALUE _01, VALUE _02, VALUE _03,
    VALUE _10, VALUE _11, VALUE _12, VALUE _13,
    VALUE _20, VALUE _21, VALUE _22, VALUE _23,
    VALUE _30, VALUE _31, VALUE _32, VALUE _33):
    _00(_00), _01(_01), _02(_02), _03(_03),
    _10(_10), _11(_11), _12(_12), _13(_13),
    _20(_20), _21(_21), _22(_22), _23(_23),
    _30(_30), _31(_31), _32(_32), _33(_33)
  { }
  // constructor to build an identity matrix
  Mat4x4T(ArgInitIdent):
    _00((VALUE)1), _01((VALUE)0), _02((VALUE)0), _03((VALUE)0),
    _10((VALUE)0), _11((VALUE)1), _12((VALUE)0), _13((VALUE)0),
    _20((VALUE)0), _21((VALUE)0), _22((VALUE)1), _23((VALUE)0),
    _30((VALUE)0), _31((VALUE)0), _32((VALUE)0), _33((VALUE)1)
  { }
  // constructor to build a matrix for translation
  Mat4x4T(ArgInitTrans, const Vec3T<VALUE> &t):
    _00((VALUE)1), _01((VALUE)0), _02((VALUE)0), _03((VALUE)t.x),
    _10((VALUE)0), _11((VALUE)1), _12((VALUE)0), _13((VALUE)t.y),
    _20((VALUE)0), _21((VALUE)0), _22((VALUE)1), _23((VALUE)t.z),
    _30((VALUE)0), _31((VALUE)0), _32((VALUE)0), _33((VALUE)1)
  { }
  // constructor to build a matrix for rotation about axis
  Mat4x4T(ArgInitRot, const Vec3T<VALUE> &axis, VALUE angle):
    _03((VALUE)0), _13((VALUE)0), _23((VALUE)0),
    _30((VALUE)0), _31((VALUE)0), _32((VALUE)0), _33((VALUE)1)
  {
    //axis.normalize();
    const VALUE sinAngle = sin(angle), cosAngle = cos(angle);
    const VALUE xx = axis.x * axis.x, xy = axis.x * axis.y;
    const VALUE xz = axis.x * axis.z, yy = axis.y * axis.y;
    const VALUE yz = axis.y * axis.z, zz = axis.z * axis.z;
    _00 = xx + cosAngle * ((VALUE)1 - xx) /* + sinAngle * 0 */;
    _01 = xy - cosAngle * xy - sinAngle * axis.z;
    _02 = xz - cosAngle * xz + sinAngle * axis.y;
    _10 = xy - cosAngle * xy + sinAngle * axis.z;
    _11 = yy + cosAngle * ((VALUE)1 - yy) /* + sinAngle * 0 */;
    _12 = yz - cosAngle * yz - sinAngle * axis.x;
    _20 = xz - cosAngle * xz - sinAngle * axis.y;
    _21 = yz - cosAngle * yz + sinAngle * axis.x;
    _22 = zz + cosAngle * ((VALUE)1 - zz) /* + sinAngle * 0 */;
  }
  // constructor to build a matrix for rotation about x axis
  Mat4x4T(ArgInitRotX, VALUE angle):
    _00((VALUE)1), _01((VALUE)0),    _02((VALUE)0),   _03((VALUE)0),
    _10((VALUE)0), _11(cos(angle)),  _12(-sin(angle)), _13((VALUE)0),
    _20((VALUE)0), _21(sin(angle)), _22(cos(angle)), _23((VALUE)0),
    _30((VALUE)0), _31((VALUE)0),    _32((VALUE)0),   _33((VALUE)1)
  { }
  // constructor to build a matrix for rotation about y axis
  Mat4x4T(ArgInitRotY, VALUE angle):
    _00(cos(angle)), _01((VALUE)0), _02(sin(angle)), _03((VALUE)0),
    _10((VALUE)0),   _11((VALUE)1), _12((VALUE)0),    _13((VALUE)0),
    _20(-sin(angle)), _21((VALUE)0), _22(cos(angle)),  _23((VALUE)0),
    _30((VALUE)0), _31((VALUE)0),    _32((VALUE)0),   _33((VALUE)1)
  { }
  // constructor to build a matrix for rotation about z axis
  Mat4x4T(ArgInitRotZ, VALUE angle):
    _00(cos(angle)),  _01(-sin(angle)), _02((VALUE)0), _03((VALUE)0),
    _10(sin(angle)), _11(cos(angle)), _12((VALUE)0), _13((VALUE)0),
    _20((VALUE)0),    _21((VALUE)0),   _22((VALUE)1), _23((VALUE)0),
    _30((VALUE)0),    _31((VALUE)0),   _32((VALUE)0), _33((VALUE)1)
  { }
  // constructor to build a matrix for scaling
  Mat4x4T(ArgInitScale, VALUE sx, VALUE sy, VALUE sz):
    _00((VALUE)sx), _01((VALUE)0),  _02((VALUE)0),  _03((VALUE)0),
    _10((VALUE)0),  _11((VALUE)sy), _12((VALUE)0),  _13((VALUE)0),
    _20((VALUE)0),  _21((VALUE)0),  _22((VALUE)sz), _23((VALUE)0),
    _30((VALUE)0),  _31((VALUE)0),  _32((VALUE)0),  _33((VALUE)1)
  { }
  // operator to allow access with [][]
  double* operator [] (int i)
  {
    assert(i >= 0 && i < 4);
    return comp + 4 * i;
  }
  // operator to allow access with [][]
  const double* operator [] (int i) const
  {
    assert(i >= 0 && i < 4);
    return comp + 4 * i;
  }
  // multiply matrix with matrix -> matrix
  Mat4x4T operator * (const Mat4x4T &mat) const
  {
    return Mat4x4T(
      _00 * mat._00 + _01 * mat._10 + _02 * mat._20 + _03 * mat._30,
      _00 * mat._01 + _01 * mat._11 + _02 * mat._21 + _03 * mat._31,
      _00 * mat._02 + _01 * mat._12 + _02 * mat._22 + _03 * mat._32,
      _00 * mat._03 + _01 * mat._13 + _02 * mat._23 + _03 * mat._33,
      _10 * mat._00 + _11 * mat._10 + _12 * mat._20 + _13 * mat._30,
      _10 * mat._01 + _11 * mat._11 + _12 * mat._21 + _13 * mat._31,
      _10 * mat._02 + _11 * mat._12 + _12 * mat._22 + _13 * mat._32,
      _10 * mat._03 + _11 * mat._13 + _12 * mat._23 + _13 * mat._33,
      _20 * mat._00 + _21 * mat._10 + _22 * mat._20 + _23 * mat._30,
      _20 * mat._01 + _21 * mat._11 + _22 * mat._21 + _23 * mat._31,
      _20 * mat._02 + _21 * mat._12 + _22 * mat._22 + _23 * mat._32,
      _20 * mat._03 + _21 * mat._13 + _22 * mat._23 + _23 * mat._33,
      _30 * mat._00 + _31 * mat._10 + _32 * mat._20 + _33 * mat._30,
      _30 * mat._01 + _31 * mat._11 + _32 * mat._21 + _33 * mat._31,
      _30 * mat._02 + _31 * mat._12 + _32 * mat._22 + _33 * mat._32,
      _30 * mat._03 + _31 * mat._13 + _32 * mat._23 + _33 * mat._33);
  }
  // multiply matrix with vector -> vector
  Vec4T<VALUE> operator * (const Vec4T<VALUE> &vec) const
  {
    return Vec4T<VALUE>(
      _00 * vec.x + _01 * vec.y + _02 * vec.z + _03 * vec.w,
      _10 * vec.x + _11 * vec.y + _12 * vec.z + _13 * vec.w,
      _20 * vec.x + _21 * vec.y + _22 * vec.z + _23 * vec.w,
      _30 * vec.x + _31 * vec.y + _32 * vec.z + _33 * vec.w);
  }
};

template <typename VALUE>
std::ostream& operator<<(std::ostream &out, const Mat4x4T<VALUE> &m)
{
  return out
    << m._00 << '\t' << m._01 << '\t' << m._02 << '\t' << m._03 << '\n'
    << m._10 << '\t' << m._11 << '\t' << m._12 << '\t' << m._13 << '\n'
    << m._20 << '\t' << m._21 << '\t' << m._22 << '\t' << m._23 << '\n'
    << m._30 << '\t' << m._31 << '\t' << m._32 << '\t' << m._33 << '\n';
}
typedef Mat4x4T<float> Mat4x4f;
typedef Mat4x4T<double> Mat4x4;

// enumeration of rotation axes
enum RotAxis {
  RotX, // rotation about x axis
  RotY, // rotation about y axis
  RotZ // rotation about z axis
};

// enumeration of possible Euler angles
enum EulerAngle {
  RotXYX = RotX + 3 * RotY + 9 * RotX, // 0 + 3 + 0 = 3
  RotXYZ = RotX + 3 * RotY + 9 * RotZ, // 0 + 3 + 18 = 21
  RotXZX = RotX + 3 * RotZ + 9 * RotX, // 0 + 6 + 0 = 6
  RotXZY = RotX + 3 * RotZ + 9 * RotY, // 0 + 6 + 9 = 15
  RotYXY = RotY + 3 * RotX + 9 * RotY, // 1 + 0 + 9 = 10
  RotYXZ = RotY + 3 * RotX + 9 * RotZ, // 1 + 0 + 18 = 19
  RotYZX = RotY + 3 * RotZ + 9 * RotX, // 1 + 6 + 0 = 7
  RotYZY = RotY + 3 * RotZ + 9 * RotY, // 1 + 6 + 9 = 16
  RotZXY = RotZ + 3 * RotX + 9 * RotY, // 2 + 0 + 9 = 11
  RotZXZ = RotZ + 3 * RotX + 9 * RotZ, // 2 + 0 + 18 = 20
  RotZYX = RotZ + 3 * RotY + 9 * RotX, // 2 + 3 + 0 = 5
  RotZYZ = RotZ + 3 * RotY + 9 * RotZ, // 2 + 3 + 18 = 23
  RotHPR = RotZXY, // used in OpenGL Performer
  RotABC = RotZYX // used in German engineering
};

/* decomposes the combined EULER angle type into the corresponding
 * individual EULER angle axis types.
 */
inline void decompose(
  EulerAngle type, RotAxis &axis1, RotAxis &axis2, RotAxis &axis3)
{
  unsigned type_ = (unsigned)type;
  axis1 = (RotAxis)(type_ % 3); type_ /= 3;
  axis2 = (RotAxis)(type_ % 3); type_ /= 3;
  axis3 = (RotAxis)type_;
}

template <typename VALUE>
Mat4x4T<VALUE> makeEuler(
  EulerAngle mode, VALUE rot1, VALUE rot2, VALUE rot3)
{
  RotAxis axis1, axis2, axis3;
  decompose(mode, axis1, axis2, axis3);
  const static VALUE axes[3][3] = {
    { (VALUE)1, (VALUE)0, (VALUE)0 },
    { (VALUE)0, (VALUE)1, (VALUE)0 },
    { (VALUE)0, (VALUE)0, (VALUE)1 }
  };
  return
      Mat4x4T<VALUE>(InitRot,
        Vec3T<VALUE>(axes[axis1][0], axes[axis1][1], axes[axis1][2]),
        rot1)
    * Mat4x4T<VALUE>(InitRot,
        Vec3T<VALUE>(axes[axis2][0], axes[axis2][1], axes[axis2][2]),
        rot2)
    * Mat4x4T<VALUE>(InitRot,
        Vec3T<VALUE>(axes[axis3][0], axes[axis3][1], axes[axis3][2]),
        rot3);
}

/* decomposes a rotation matrix into EULER angles.
 *
 * It is necessary that the upper left 3x3 matrix is composed of rotations
 * only. Translational parts are not considered.
 * Other transformations (e.g. scaling, shearing, projection) may cause
 * wrong results.
 */
template <typename VALUE>
void decompose(
  const Mat4x4T<VALUE> &mat,
  RotAxis axis1, RotAxis axis2, RotAxis axis3,
  VALUE &angle1, VALUE &angle2, VALUE &angle3)
{
  assert(axis1 != axis2 && axis2 != axis3);
  /* This is ported from EulerAngles.h of the Eigen library. */
  const int odd = (axis1 + 1) % 3 == axis2 ? 0 : 1;
  const int i = axis1;
  const int j = (axis1 + 1 + odd) % 3;
  const int k = (axis1 + 2 - odd) % 3;
  if (axis1 == axis3) {
    angle1 = atan2(mat[j][i], mat[k][i]);
    if ((odd && angle1 < (VALUE)0) || (!odd && angle1 > (VALUE)0)) {
      angle1 = angle1 > (VALUE)0 ? angle1 - (VALUE)Pi : angle1 + (VALUE)Pi;
      const VALUE s2 = length(Vec2T<VALUE>(mat[j][i], mat[k][i]));
      angle2 = -atan2(s2, mat[i][i]);
    } else {
      const VALUE s2 = length(Vec2T<VALUE>(mat[j][i], mat[k][i]));
      angle2 = atan2(s2, mat[i][i]);
    }
    const VALUE s1 = sin(angle1);
    const VALUE c1 = cos(angle1);
    angle3 = atan2(c1 * mat[j][k] - s1 * mat[k][k],
      c1 * mat[j][j] - s1 * mat[k][j]);
  } else {
    angle1 = atan2(mat[j][k], mat[k][k]);
    const VALUE c2 = length(Vec2T<VALUE>(mat[i][i], mat[i][j]));
    if ((odd && angle1<(VALUE)0) || (!odd && angle1 > (VALUE)0)) {
      angle1 = (angle1 > (VALUE)0)
        ? angle1 - (VALUE)Pi : angle1 + (VALUE)Pi;
      angle2 = atan2(-mat[i][k], -c2);
    } else angle2 = atan2(-mat[i][k], c2);
    const VALUE s1 = sin(angle1);
    const VALUE c1 = cos(angle1);
    angle3 = atan2(s1 * mat[k][i] - c1 * mat[j][i],
      c1 * mat[j][j] - s1 * mat[k][j]);
  }
  if (!odd) {
    angle1 = -angle1; angle2 = -angle2; angle3 = -angle3;
  }
}

#endif // LIN_MATH_H