C++ 计算表示稀疏向量c++；简介和源代码_C++_Dictionary_Vector_Distance_Cosine Similarity

C++ 计算表示稀疏向量c++；简介和源代码

c++ dictionary vector

C++ 计算表示稀疏向量c++；简介和源代码,c++,dictionary,vector,distance,cosine-similarity,C++,Dictionary,Vector,Distance,Cosine Similarity,我试图计算两个维度为169647的稀疏向量之间的余弦相似性。作为输入，这两个向量表示为形式的字符串。只有向量的非零元素才被赋予索引 x = "1:0.1 43:0.4 100:0.43 10000:0.9" y = "200:0.5 500:0.34 501:0.34" 首先，我们使用函数splitVector将x和y分别转换为两个向量。。然后我们使用函数cosine\u相似度计算距离。无需使用拆分函数。我正在使用它，以防您希望运行代码 #includ

我试图计算两个维度为169647的稀疏向量之间的余弦相似性。作为输入，这两个向量表示为 `索引:值` 形式的字符串。只有向量的非零元素才被赋予索引

x = "1:0.1 43:0.4 100:0.43 10000:0.9"
y = "200:0.5 500:0.34 501:0.34"

首先，我们使用函数 `splitVector` 将 x 和 y 分别转换为两个 `vector<float>`，然后使用函数 `cosine_similarity` 计算距离。`split` 函数本身无需关注；我提供它只是为了方便您运行代码
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

// Splits `s` on every occurrence of the delimiter `c` and appends the pieces
// to `v` (existing contents of `v` are kept).
//
// A string containing k delimiters always yields k + 1 tokens, so:
//   "1:0.5" -> "1", "0.5"
//   "a:"    -> "a", ""
//   "abc"   -> "abc"   (previously nothing was appended when `c` was absent,
//                       silently dropping the whole input)
void split(const string& s, char c, vector<string>& v) {
   string::size_type start = 0;
   string::size_type pos = s.find(c);

   while (pos != string::npos) {
      v.push_back(s.substr(start, pos - start));
      start = pos + 1;
      pos = s.find(c, start);
   }

   // Trailing token: everything after the last delimiter, or the entire
   // string when no delimiter was found.
   v.push_back(s.substr(start));
}

// Cosine similarity of two dense vectors: dot(A, B) / (|A| * |B|).
//
// If the vectors differ in length, the shorter one is treated as if it were
// padded with zeros: the dot product runs over the common prefix only, while
// each norm covers its whole vector. (The previous version indexed B with
// A's length and read out of bounds when B was shorter.)
//
// Returns 0 when either vector is empty or has zero norm, where the
// similarity is mathematically undefined and the division would yield NaN.
float cosine_similarity(const std::vector<float> & A,const std::vector<float> & B)
{
    typedef std::vector<float>::size_type index_t;
    const index_t common = std::min(A.size(), B.size());

    float dot = 0.0f;
    for (index_t i = 0; i < common; ++i)
        dot += A[i] * B[i];

    float denom_a = 0.0f;
    for (index_t i = 0; i < A.size(); ++i)
        denom_a += A[i] * A[i];

    float denom_b = 0.0f;
    for (index_t i = 0; i < B.size(); ++i)
        denom_b += B[i] * B[i];

    if (denom_a == 0.0f || denom_b == 0.0f)
        return 0.0f;

    return dot / (std::sqrt(denom_a) * std::sqrt(denom_b));
}

// Scatters sparse "index:value" entries into the pre-sized dense vector
// `values`: for each entry, values[index] = value. Slots not mentioned in `v`
// are left untouched, so the caller is expected to zero-fill `values` first.
//
// The index is the integer before the first ':'; the value is the number that
// follows it (anything after that number is ignored).
//
// Throws std::invalid_argument if an entry contains no ':' and
// std::out_of_range if an index is negative or >= values.size(). Both cases
// used to be undefined behaviour (read of a missing token / write outside the
// buffer).
void splitVector(const vector<string> & v, vector<float> & values)
{
    for (vector<string>::size_type i = 0; i < v.size(); ++i)
    {
        const string& entry = v[i];
        const string::size_type colon = entry.find(':');
        if (colon == string::npos)
            throw std::invalid_argument("splitVector: missing ':' in entry \"" + entry + "\"");

        // strtol stops at the ':' on its own, so no substring copy is needed.
        const long idx = std::strtol(entry.c_str(), nullptr, 10);
        const float val = std::strtof(entry.c_str() + colon + 1, nullptr);

        if (idx < 0 || static_cast<vector<float>::size_type>(idx) >= values.size())
            throw std::out_of_range("splitVector: index out of range in entry \"" + entry + "\"");

        values[static_cast<vector<float>::size_type>(idx)] = val;
    }
}

// Demo driver: expands two sparse "index:value" inputs into dense vectors of
// the full dimensionality and prints their cosine similarity.
int main()
{
   // Sparse inputs; only the listed indices carry a value.
   const vector<string> sparse_x {"1:0.1","43:0.4","50:0.43","90:0.9"};
   const vector<string> sparse_y {"20:0.5","40:0.34","50:0.34"};

   // Zero-filled dense buffers, one slot per dimension.
   const int dimension = 169647;
   vector<float> dense_x(dimension, 0.0f);
   vector<float> dense_y(dimension, 0.0f);

   // Scatter the sparse entries into the dense buffers.
   splitVector(sparse_x, dense_x);
   splitVector(sparse_y, dense_y);

   // Report the similarity on stdout.
   const float similarity = cosine_similarity(dense_x, dense_y);
   cout << similarity << endl;
}

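#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
void split(const string& s, char c, vector<string>& v) {
   string::size_type i = 0;
   string::size_type j = s.find(c);
   while (j != string::npos) {
      v.push_back(s.substr(i, j-i));
      i = ++j;
      j = s.find(c, j);
      if (j == string::npos)
         v.push_back(s.substr(i, s.length()));
   }
}
float cosine_similarity(const std::vector<float> & A, const std::vector<float> & B)
{
    float dot = 0.0, denom_a = 0.0, denom_b = 0.0 ;
    for(unsigned int i = 0; i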
关于你的问题：

原始复杂性为Θ（N2）
您建议的解决方案的复杂性是 O((m+n)·log(max(m, n)))，这可能要小得多；如果改用 std::unordered_map，您可以将其减少到预期的 O(m+n)
听起来不错，但是，像往常一样-YMMV。您应该在整个应用程序的上下文中分析此操作（以查看是否存在问题），以及此操作中的步骤
稀疏向量的常见表示形式是一个简单的索引数组加一个值数组，或者有时是一个由（索引，值）对组成的数组，因为通常需要与值一起访问索引（计算向量长度/归一化之类的操作除外）。您建议了另外两种形式：使用 std::map 和 std::unordered_map

请在最后找到结论
基准
我为这四种表示实现了向量运算长度和内积（点积）。此外，我还采用OP问题中建议的非常直接的方法实现了内积，并对向量对实现进行了改进的余弦距离计算
完整代码
我已经在这些实现上运行了一个基准测试。您可以从这里查看我的代码，我从中获取了以下数字（尽管这些比率与我自己的机器上的运行非常吻合，只是更高的RunCount
，随机输入向量的分布更均匀）。以下是结果：
结果
成对容器
以下是排序的向量
和映射
的点
的实现：
LenSqr
的实现：
// Squared Euclidean length of a sparse vector stored as (index, value) pairs:
// the sum of value * value over every entry in `vec`. Indices are ignored.
template<class PairContainer>
inline float LenSqrPairs(const PairContainer& vec) {
    float sumOfSquares = 0;
    for(const auto& entry : vec) {
        const auto& value = entry.second;
        sumOfSquares += value * value;
    }
    return sumOfSquares;
}

special[2]
在内积过程中迭代两个向量时，只需计算两个向量的平方范数（查看完整的代码了解详细信息）。我添加了这一点来证明一点：缓存命中率很重要。如果我能更有效地访问内存，我可以用向量对1来击败向量对的天真方法（当然，如果您优化了其他路径，情况也是如此）
结论
请注意，所有经过测试的实现（具有 O(k·log k) 行为的 special[1] 除外）都表现出 O(k) 的理论运行时行为，其中 k 是稀疏向量中非零元素的数量：这对于 map 和 vector 来说是显而易见的，因为两者的 Dot 实现相同；而 unordered_map 则依靠摊销 O(1) 的 find 来达到这一点
那么，为什么映射是稀疏向量的错误工具呢？对于 std::map 来说，答案是迭代树结构的开销；对于 std::unordered_map 来说，则是 find 的随机内存访问模式。这两者都会因缓存未命中而产生巨大的开销
为了澄清 std::unordered_map 相对于 std::map 的所谓理论优势，请查看 special[1] 的结果。这是 std::unordered_map 能够击败的实现，但这并不是因为它更适合这个问题，而是因为那个使用 std::map 的实现是次优的。
您考虑过只使用现有库来计算它吗？我认为使用向量计算它可能比使用映射更快。。什么向量是否存在问题？如果使用matlab:）@HaniGoc更好如果索引已经有序（在大多数稀疏向量表示中是有序的），则映射
具有不必要的O（nlogn）
构造时间，因此它将主导距离比较，即O（n）。然后是迭代两个映射的开销
// Cosine similarity of two sparse vectors stored as index -> value maps:
// dot(A, B) / (|A| * |B|), where indices missing from a map count as zero.
//
// std::map iterates in ascending key order, so both maps are walked once in
// lockstep (a sorted merge). That gives O(|A| + |B|) instead of one or two
// tree lookups in B per entry of A, and accumulates every sum in the same
// key order as a plain loop over each map would.
//
// Returns 0 when either vector is empty or has zero norm, where the
// similarity is undefined and the division would otherwise yield NaN.
float cosine_similarity(const std::map<int,float> & A,const std::map<int,float> & B)
{
    float dot = 0.0f, denom_a = 0.0f, denom_b = 0.0f;

    std::map<int,float>::const_iterator a = A.begin();
    std::map<int,float>::const_iterator b = B.begin();

    while (a != A.end() && b != B.end())
    {
        if (a->first < b->first)
        {
            // Index only present in A: contributes to |A| only.
            denom_a += a->second * a->second;
            ++a;
        }
        else if (b->first < a->first)
        {
            // Index only present in B: contributes to |B| only.
            denom_b += b->second * b->second;
            ++b;
        }
        else
        {
            // Shared index: contributes to the dot product and both norms.
            dot += a->second * b->second;
            denom_a += a->second * a->second;
            denom_b += b->second * b->second;
            ++a;
            ++b;
        }
    }

    // Whatever is left in either map has no partner in the other one.
    for (; a != A.end(); ++a)
        denom_a += a->second * a->second;
    for (; b != B.end(); ++b)
        denom_b += b->second * b->second;

    if (denom_a == 0.0f || denom_b == 0.0f)
        return 0.0f;

    return dot / (std::sqrt(denom_a) * std::sqrt(denom_b));
}

Explanation of the output of the benchmark:
  pairs: implementation using (sorted) std::vector of pairs
  map'd: implementation using std::map
  hashm: implementation using std::unordered_map
  class: implementation using two separate std::vector for indices and values respectively
  specl dot (naive map): dot product using map.find instead of proper iteration
  specl cos (optimised): cosine distance iterating only once over both vectors

Columns are the percentage of non-zeros in the random sparse vector (on average).
Values are in terms of the vector of pairs implementation
(1: equal runtime, 2: took twice as long, 0.5: took half as long).

                    inner product (dot)
            5%          10%          15%          25%
map'd       3.3          3.5          3.7          4.0
hashm       3.6          4.0          4.8          5.2
class       1.1          1.1          1.1          1.1
special[1]  8.3          9.8         10.7         10.8

                    norm squared (len2)
            5%          10%          15%          25%
map'd       6.9          7.6          8.3         10.2
hashm       2.3          3.6          4.1          4.8
class       0.98         0.95         0.93         0.75

                    cosine distance (cos)
            5%          10%          15%          25%
map'd       4.0          4.3          4.6          5.0
hashm       3.2          3.9          4.6          5.0
class       1.1          1.1          1.1          1.1
special[2]  0.92         0.95         0.93         0.94
// Returns Dot(lhs, rhs) divided by the square root of
// LenSqr(lhs) * LenSqr(rhs), i.e. the cosine of the angle between the two
// vectors. Dot and LenSqr are resolved for the concrete Vector type.
template<class Vector>
inline float CosineDistance(const Vector& lhs, const Vector& rhs) {
    const auto numerator = Dot(lhs, rhs);
    const auto squaredNormProduct = LenSqr(lhs) * LenSqr(rhs);
    return numerator / std::sqrt(squaredNormProduct);
}

// Dot product of two sparse vectors stored as (index, value) pairs.
//
// Both containers are walked front to back in a single merge pass: whichever
// side currently has the smaller index is advanced, and when the indices
// match the two values are multiplied into the result.
// NOTE(review): this only finds every shared index if both containers are
// ordered by ascending index — confirm callers guarantee that.
template<class PairContainerSorted>
inline float DotPairsSorted(const PairContainerSorted& lhs, const PairContainerSorted& rhs) {
    float result = 0;
    auto left = lhs.begin();
    auto right = rhs.begin();
    const auto leftEnd = lhs.end();
    const auto rightEnd = rhs.end();

    while(left != leftEnd && right != rightEnd) {
        if(left->first < right->first) {
            ++left;             // left index is behind: catch up
        } else if(right->first < left->first) {
            ++right;            // right index is behind: catch up
        } else {
            result += left->second * right->second;
            ++left;
            ++right;
        }
    }
    return result;
}

// Dot product of two sparse vectors stored in associative containers
// (index -> value). Every entry of `lhs` is looked up in `rhs` by index via
// find(); entries without a partner contribute nothing.
template<class PairMap>
inline float DotPairsMapped(const PairMap& lhs, const PairMap& rhs) {
    float result = 0;
    const auto notFound = rhs.end();
    for(const auto& entry : lhs) {
        const auto match = rhs.find(entry.first);
        if(match == notFound)
            continue;
        result += entry.second * match->second;
    }
    return result;
}

// Sum of the squared values of all (index, value) entries in `vec`, i.e. the
// squared length of the sparse vector. Only `.second` of each entry is read.
template<class PairContainer>
inline float LenSqrPairs(const PairContainer& vec) {
    float lengthSquared = 0;
    for(const auto& element : vec) {
        lengthSquared += element.second * element.second;
    }
    return lengthSquared;
}

// Dot product of two SparseVector objects.
//
// `idx` and `val` are read as parallel arrays: val[k] is the value stored at
// index idx[k]. Both index arrays are walked in one merge pass; the side with
// the smaller current index is advanced, and matching indices multiply their
// values into the result. Returns 0 if either vector has no entries.
// NOTE(review): presumably idx is sorted ascending and val has the same
// length as idx — confirm against the SparseVector definition.
inline float Dot(const SparseVector& lhs, const SparseVector& rhs) {
    float result = 0;
    const size_t countLhs = lhs.idx.size();
    const size_t countRhs = rhs.idx.size();
    size_t posLhs = 0;
    size_t posRhs = 0;

    while(posLhs < countLhs && posRhs < countRhs) {
        const size_t indexLhs = lhs.idx[posLhs];
        const size_t indexRhs = rhs.idx[posRhs];
        if(indexLhs < indexRhs) {
            ++posLhs;
        } else if(indexRhs < indexLhs) {
            ++posRhs;
        } else {
            result += lhs.val[posLhs] * rhs.val[posRhs];
            ++posLhs;
            ++posRhs;
        }
    }
    return result;
}

// Squared length of a SparseVector: the sum of v * v over every stored value
// in `vec.val`. The index array is not consulted.
inline float LenSqr(const SparseVector& vec) {
    float sumOfSquares = 0;
    for(auto it = vec.val.begin(); it != vec.val.end(); ++it) {
        const float value = *it;
        sumOfSquares += value * value;
    }
    return sumOfSquares;
}