C++ 如何计算（A*B）%C？_C++_Math_Optimization_Numbers

C++ 如何计算（A*B）%C？

c++ math optimization

C++ 如何计算（A*B）%C？,c++,math,optimization,numbers,C++,Math,Optimization,Numbers,有人能帮我计算一下（A*B）%C，在那里1（未经广泛测试）这个版本只有一个长整数模——它甚至可能比大数据块方法更快，这取决于处理器实现整数模的方式现场演示：--与雅克的结果相同堆栈溢出的一种实现： #include <stdint.h> #include <tuple> #include <iostream> typedef std::tuple< uint32_t, uint32_t > split_t; split_t split(

有人能帮我计算一下

（A*B）%C

，其中

1（未经广泛测试）
这个版本只需要做一次长整数取模运算——它甚至可能比按大块处理的方法更快，具体取决于处理器实现整数取模的方式

在线演示：——结果与 Yakk 的相同
Stack Overflow 上一个回答的一种实现：
#include <stdint.h>
#include <tuple>
#include <iostream>

typedef std::tuple< uint32_t, uint32_t > split_t;

// Breaks a 64-bit value into two 32-bit words.
// Element 0 of the result holds bits 0..31 of `a`, element 1 holds bits 32..63.
split_t split( uint64_t a )
{
  const uint32_t low_word  = static_cast<uint32_t>( a & 0xFFFFFFFFu );
  const uint32_t high_word = static_cast<uint32_t>( a >> 32 );
  return split_t( low_word, high_word );
}

typedef std::tuple< uint64_t, uint64_t, uint64_t, uint64_t > cross_t;

// Applies `op` to every pairing of an element of `lhs` with an element of `rhs`.
// With lhs = (l0, l1) and rhs = (r0, r1) the result is
//   ( op(l0, r0), op(l1, r0), op(l0, r1), op(l1, r1) ).
template<typename Lambda>
cross_t cross( split_t lhs, split_t rhs, Lambda&& op )
{
  uint32_t& l0 = std::get<0>(lhs);
  uint32_t& l1 = std::get<1>(lhs);
  uint32_t& r0 = std::get<0>(rhs);
  uint32_t& r1 = std::get<1>(rhs);

  auto first_first   = op( l0, r0 );
  auto second_first  = op( l1, r0 );
  auto first_second  = op( l0, r1 );
  auto second_second = op( l1, r1 );

  return std::make_tuple( first_first, second_first, first_second, second_second );
}

// Computes (a * 2^k) % c using k modular doublings.
//
// Works for every non-zero 64-bit c, including values with the high bit set:
// the doubling step never forms 2*a, so it cannot wrap past 2^64.
// c == 0 is a division by zero (undefined behaviour), as with plain `%`.
uint64_t a_times_2_k_mod_c( uint64_t a, unsigned k, uint64_t c )
{
  a %= c;
  for (unsigned i = 0; i < k; ++i)
  {
    // Here a < c, so (2*a) % c is either 2*a or 2*a - c.
    // 2*a >= c  <=>  a >= c - a, and c - a cannot underflow.
    const uint64_t gap = c - a;
    a = (a >= gap) ? a - gap : a + a;
  }
  return a;
}

// Computes (a * b) % c for 64-bit operands without a 128-bit integer type.
//
// a and b are reduced mod c, split into 32-bit halves, and the four partial
// products are recombined modulo c.  Every partial sum is reduced before it
// is added, so this function introduces no overflow of its own; the only
// remaining restriction on c is the one a_times_2_k_mod_c documents.
// c == 0 is a division by zero (undefined behaviour).
uint64_t a_times_b_mod_c( uint64_t a, uint64_t b, uint64_t c )
{
  // ensure a and b are < c:
  a %= c;
  b %= c;

  // (x + y) % c for x, y < c, computed without forming x + y (which could
  // wrap): x + y >= c  <=>  x >= c - y, and c - y cannot underflow.
  auto add_mod = [c](uint64_t x, uint64_t y)->uint64_t {
    return (x >= c - y) ? x - (c - y) : x + y;
  };

  auto Z = cross( split(a), split(b), [](uint32_t lhs, uint32_t rhs)->uint64_t {
    return static_cast<uint64_t>(lhs) * static_cast<uint64_t>(rhs);
  } );

  uint64_t to_the_0;
  uint64_t to_the_32_a;
  uint64_t to_the_32_b;
  uint64_t to_the_64;
  std::tie( to_the_0, to_the_32_a, to_the_32_b, to_the_64 ) = Z;

  // a*b = to_the_0 + 2^32 * ( to_the_32_a + to_the_32_b + 2^32 * to_the_64 )
  // Nesting the shifts this way costs 64 doublings in total instead of 96.
  const uint64_t high   = a_times_2_k_mod_c( to_the_64, 32, c );
  const uint64_t middle = add_mod( add_mod( to_the_32_a % c, to_the_32_b % c ), high );

  // to_the_0 can be as large as (2^32-1)^2, so it must be reduced before
  // it is added to anything.
  return add_mod( to_the_0 % c, a_times_2_k_mod_c( middle, 32, c ) );
}

// Demo driver: prints one modular product to stdout.
int main()
{
  // The first argument was written as 19010000000000000000, which is larger
  // than 2^64 - 1 and therefore fits no standard integer type.  It is spelled
  // here as that value reduced modulo 2^64 (19010000000000000000 - 2^64),
  // i.e. what a wrapping conversion to the uint64_t parameter yields.
  const uint64_t retval = a_times_b_mod_c( 563255926290448384ULL, 1011000000000000ULL, 1231231231231211ULL );
  std::cout << retval << "\n";
}

#包括
#包括
#包括
typedef std:：tuplesplit\u t；
分割分割（uint64分割）
{
静态常数32_t mask=-1；
auto-retval=std:：make_tuple（mask&a，（a>>32））；
//std：：我可以发誓这是一堆东西的复制品。但是它们很难找到…我正要回答，然后我看到@Mystical在这里，哈哈，我出去了，我很难找到我要找的那个…：（我相信你可以尝试将数字以二进制形式存储在数组中，然后像使用移位运算符一样使用它。@devnull SO的目标是成为一个对其他人有用的QA档案。如果Q碰巧不够努力，那么如果它能帮到忙，那就这样吧。这个问题与“通常”不同缺乏努力的问题，因为它不是太本地化（它甚至有无数次重复）。只需在SO上列出最重要的问题。其中许多问题显示出同样的努力。但他们有数千张选票，因为他们很有帮助-在实现SO目标的情况下。if（mod_product>C）mod_product-=C；
不会让它更快，我敢打赌——用分支取代%
并不划算：BIG tmp[] = {mod_product, mod_product - C}; mod_product = tmp[mod_product >= C];
是等效的，但在大多数现代处理器/编译器下要快得多。同样，消除 A&1
分支也会很好。@Yakk：这完全取决于处理器。有些处理器有条件mov指令，效率非常高。在另一些处理器上，数组查找可能会更好。但是64位%
非常可笑在几乎所有的系统上都很慢，在很多情况下比管道刷新慢。在数组查找更快的处理器上，你不认为优化器已经知道了这一点吗？此外，如果优化器不能完成消除分支的任务，我有其他技巧可以在查找表之前使用。我还没有遇到很多编译器能够做到这一点完全消除这样的分支……我没有想到64位上%
的慢。@Yakk：整数除法和模是慢指令。即使在每个指令都有统一的周期计数的处理器上，这些通常也是例外。与数组查找不同，为什么不A-=C*（A>=C）
？@BenVoigt 没有什么特别的理由。另外，你对 a_times_2_k_mod_c(to_the_64, 64, c) 的一次调用，工作量就和我的整个函数差不多了。我看不出你的拆分方法有什么好处……也看不出用一堆花哨的 C++ 模板来实现有什么好处。@BenVoigt 它用的是减法加 4 次乘法，除非我算错了——很可能是我算错了。你的更简洁。我只是把某人的帖子移植成了一个实现，移植得比较粗糙。是的，但是如果你把移位循环运行 96 次而不是（最多）64 次，你可能会失去所有的好处。也许 a_times_2_k_mod_c(to_the_32_a + to_the_32_b + a_times_2_k_mod_c(to_the_64, 32, c), 32, c) 会有所帮助。
typedef unsigned long long BIG;

// Computes (A * B) % C by binary double-and-add, without needing a type
// wider than BIG.
//
// Works for every non-zero C, including values above 2^63: the modular add
// and modular double below never form a sum that could wrap.
// The result is always in [0, C).  C == 0 is a division by zero (undefined
// behaviour).
BIG mod_multiply( BIG A, BIG B, BIG C )
{
    BIG mod_product = 0;
    // A %= C; may or may not help performance
    B %= C;

    while (A) {
        if (A & 1) {
            // mod_product = (mod_product + B) % C.  Both operands are < C, so
            // the sum reaches C exactly when mod_product >= C - B.
            const BIG room = C - B;
            mod_product = (mod_product >= room) ? mod_product - room
                                                : mod_product + B;
        }
        A >>= 1;
        // B = (2 * B) % C, using the same comparison so 2*B is never formed.
        const BIG gap = C - B;
        B = (B >= gap) ? B - gap : B + B;
    }

    return mod_product;
}

#include <stdint.h>
#include <tuple>
#include <iostream>

typedef std::tuple< uint32_t, uint32_t > split_t;

// Breaks a 64-bit value into two 32-bit words.
// Element 0 of the result holds bits 0..31 of `a`, element 1 holds bits 32..63.
split_t split( uint64_t a )
{
  const uint32_t low_word  = static_cast<uint32_t>( a & 0xFFFFFFFFu );
  const uint32_t high_word = static_cast<uint32_t>( a >> 32 );
  return split_t( low_word, high_word );
}

typedef std::tuple< uint64_t, uint64_t, uint64_t, uint64_t > cross_t;

// Applies `op` to every pairing of an element of `lhs` with an element of `rhs`.
// With lhs = (l0, l1) and rhs = (r0, r1) the result is
//   ( op(l0, r0), op(l1, r0), op(l0, r1), op(l1, r1) ).
template<typename Lambda>
cross_t cross( split_t lhs, split_t rhs, Lambda&& op )
{
  uint32_t& l0 = std::get<0>(lhs);
  uint32_t& l1 = std::get<1>(lhs);
  uint32_t& r0 = std::get<0>(rhs);
  uint32_t& r1 = std::get<1>(rhs);

  auto first_first   = op( l0, r0 );
  auto second_first  = op( l1, r0 );
  auto first_second  = op( l0, r1 );
  auto second_second = op( l1, r1 );

  return std::make_tuple( first_first, second_first, first_second, second_second );
}

// Computes (a * 2^k) % c using k modular doublings.
//
// Works for every non-zero 64-bit c, including values with the high bit set:
// the doubling step never forms 2*a, so it cannot wrap past 2^64.
// c == 0 is a division by zero (undefined behaviour), as with plain `%`.
uint64_t a_times_2_k_mod_c( uint64_t a, unsigned k, uint64_t c )
{
  a %= c;
  for (unsigned i = 0; i < k; ++i)
  {
    // Here a < c, so (2*a) % c is either 2*a or 2*a - c.
    // 2*a >= c  <=>  a >= c - a, and c - a cannot underflow.
    const uint64_t gap = c - a;
    a = (a >= gap) ? a - gap : a + a;
  }
  return a;
}

// Computes (a * b) % c for 64-bit operands without a 128-bit integer type.
//
// a and b are reduced mod c, split into 32-bit halves, and the four partial
// products are recombined modulo c.  Every partial sum is reduced before it
// is added, so this function introduces no overflow of its own; the only
// remaining restriction on c is the one a_times_2_k_mod_c documents.
// c == 0 is a division by zero (undefined behaviour).
uint64_t a_times_b_mod_c( uint64_t a, uint64_t b, uint64_t c )
{
  // ensure a and b are < c:
  a %= c;
  b %= c;

  // (x + y) % c for x, y < c, computed without forming x + y (which could
  // wrap): x + y >= c  <=>  x >= c - y, and c - y cannot underflow.
  auto add_mod = [c](uint64_t x, uint64_t y)->uint64_t {
    return (x >= c - y) ? x - (c - y) : x + y;
  };

  auto Z = cross( split(a), split(b), [](uint32_t lhs, uint32_t rhs)->uint64_t {
    return static_cast<uint64_t>(lhs) * static_cast<uint64_t>(rhs);
  } );

  uint64_t to_the_0;
  uint64_t to_the_32_a;
  uint64_t to_the_32_b;
  uint64_t to_the_64;
  std::tie( to_the_0, to_the_32_a, to_the_32_b, to_the_64 ) = Z;

  // a*b = to_the_0 + 2^32 * ( to_the_32_a + to_the_32_b + 2^32 * to_the_64 )
  // Nesting the shifts this way costs 64 doublings in total instead of 96.
  const uint64_t high   = a_times_2_k_mod_c( to_the_64, 32, c );
  const uint64_t middle = add_mod( add_mod( to_the_32_a % c, to_the_32_b % c ), high );

  // to_the_0 can be as large as (2^32-1)^2, so it must be reduced before
  // it is added to anything.
  return add_mod( to_the_0 % c, a_times_2_k_mod_c( middle, 32, c ) );
}

// Demo driver: prints one modular product to stdout.
int main()
{
  // The first argument was written as 19010000000000000000, which is larger
  // than 2^64 - 1 and therefore fits no standard integer type.  It is spelled
  // here as that value reduced modulo 2^64 (19010000000000000000 - 2^64),
  // i.e. what a wrapping conversion to the uint64_t parameter yields.
  const uint64_t retval = a_times_b_mod_c( 563255926290448384ULL, 1011000000000000ULL, 1231231231231211ULL );
  std::cout << retval << "\n";
}