C++ 反向索引：在一组文档中查找短语_C++_Algorithm_Intersection_Inverted Index

C++ 反向索引：在一组文档中查找短语

c++ algorithm

C++ 反向索引：在一组文档中查找短语,c++,algorithm,intersection,inverted-index,C++,Algorithm,Intersection,Inverted Index,我正在实现一个反向索引结构，特别是一个允许布尔查询和字级粒度的结构我有一个庞大的文本数据库，我保存了一个索引，告诉我每个单词在哪个文件中（IDdoc），以及它在文件中的位置（position）。（一个单词可以在多个文件中，也可以在一个文件中的多个位置。）因此，我为每个单词保留了一个向量： vector<pair<IDdoc,position>> occurences_of_word; 我不知道这是否是最有效的，但您可以从words[0]的文档/位置开始。然后转到wo

我正在实现一个反向索引结构，特别是一个允许布尔查询和字级粒度的结构

我有一个庞大的文本数据库，我保存了一个索引，告诉我每个单词在哪个文件中（

IDdoc

），以及它在文件中的位置（

position

）。（一个单词可以在多个文件中，也可以在一个文件中的多个位置。）

因此，我为每个单词保留了一个向量：

vector<pair<IDdoc,position>> occurences_of_word;

我不知道这是否是最有效的，但您可以从

words[0]

的文档/位置开始。然后转到

words[1]

，在相同的文档中查找位置等于

words[0].position + words[0].length + 1

的出现项。然后以同样的方式依次处理剩余的

words

。对于较长的短语，候选范围应该会很快缩小？

正如您所说，您使用的数据结构实际上是一个完整的倒排索引，正如维基百科所说：

倒排索引有两种主要变体：记录级倒排索引（或倒排文件索引，或简称倒排文件）包含每个单词对文档的引用列表；单词级倒排索引（或完整倒排索引、倒排列表）还额外包含每个单词在文档中的位置。[2] 后一种形式提供了更多功能（如短语搜索），但需要更多的时间和空间来创建。

也就是说，您还可以尝试创建短语索引：

（参见图2作为演示）

如果您没有创建短语索引，那么（我认为）您能做的就是：先检索包含特定单词的文档，在把查询从单个单词逐步扩展为短语的过程中，不断与已有的文档集合求交集，最后逐一检查返回的每个文档是否确实包含该“短语”，而不是仅仅包含“分散在不同位置的各个单词”。

要从字符串表示中查找特定单词，您可能需要查看以下内容。用于创建您可能想要的结果的简单并集。这个实现更多的是作为一个演示，而不是作为一个非常理想的最终实现（c.f.草率短语解析）

如果按照你口头描述的方式来写这段代码（“第二个单词位于紧跟第一个单词之后的位置”），似乎会清楚得多：

下一部分有点混乱，因为这两个分支似乎都是要递增 i 和 j 并更新 ID_doc_one 和 ID_doc_two，所以把这部分提取到 if 块之后的公共代码中是合理的；但同样地，

else{}

使得很难判断您实际在做什么

    if (pos_one + 1 == pos_two)
    {
        intersection.push_back(make_pair(ID_doc_one,pos_two));
        ID_doc_one = v1[++i].first;
        ID_doc_two = v2[++j].first;
    }

    else {
    }   // To avoid "out of range" errors
        if (i < SIZE_VECTOR_ONE - 1)
            ID_doc_one = v1[++i].first;
        if (j < SIZE_VECTOR_TWO - 1)
            ID_doc_two = v2[++j].first;
    }

if（位置1+1==位置2）
{
交叉点。向后推（形成一对（ID文档一，位置二））；
ID_doc_one=v1[++i]。第一；
ID_doc_two=v2[++j]。第一；
}
否则{
}//避免“超出范围”错误
if（i


当两个数组中出现匹配时，i 和 j 总是都要递增，这并不取决于任何条件；另外我也不明白你为什么要使用 pos_two，因为这个短语实际上是在 pos_one 处找到的
我会这样写：
#include<iostream>
#include<map>
#include<vector>
#include<string>

// Type aliases for a word-level inverted index.
using Word_t = std::string;
using WordPosition_t = unsigned int;
using IDdocument_t = unsigned int;

// One occurrence of a word: (document id, position inside that document).
using DocumentPosition_t = std::pair<IDdocument_t, WordPosition_t>;
// Every occurrence of a single word.
using WordReferences_t = std::vector<DocumentPosition_t>;

// Returns every occurrence in `v1` that is immediately followed, in the same
// document, by an occurrence in `v2` (i.e. v2.position == v1.position + 1).
//
// Both lists are walked front to back exactly once, so they are expected to be
// sorted ascending by (document id, position); unsorted input will simply miss
// matches.
//
// v1: occurrences of the first word of the phrase.
// v2: occurrences of the second word of the phrase.
// Returns the matching entries of `v1` (the position where the phrase starts).
WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2)
{
    WordReferences_t intersection;

    auto firstIt = v1.begin();
    auto secondIt = v2.begin();
    while (firstIt != v1.end() && secondIt != v2.end())
    {
        // Different documents: advance whichever list is lagging behind.
        if (firstIt->first < secondIt->first)
        {
            ++firstIt;
            continue;
        }
        if (secondIt->first < firstIt->first)
        {
            ++secondIt;
            continue;
        }

        // Same document from here on.
        // The second word sits at or before the first word, so it cannot
        // follow it. Written without "+ 1" so that a first-word position equal
        // to the maximum unsigned value does not wrap around to 0.
        if (secondIt->second <= firstIt->second)
        {
            ++secondIt;
            continue;
        }

        // The second word is strictly after the first one, hence its position
        // is >= 1 and the subtraction below cannot wrap. A gap larger than one
        // means this first-word occurrence does not start a phrase.
        if (secondIt->second - 1 > firstIt->second)
        {
            ++firstIt;
            continue;
        }

        // Adjacent positions in the same document: we found a phrase.
        intersection.emplace_back(*firstIt);
        ++firstIt;
        ++secondIt;
    }

    return intersection;
}

int main()
{
    WordReferences_t v1, v2;
    v1.push_back(std::make_pair(10, 5));
    v1.push_back(std::make_pair(10, 25));
    v1.push_back(std::make_pair(11, 10));
    v1.push_back(std::make_pair(12, 1));
    v1.push_back(std::make_pair(12, 11));
    v1.push_back(std::make_pair(12, 21));
    v1.push_back(std::make_pair(12, 31));
    v1.push_back(std::make_pair(15, 11));
    v1.push_back(std::make_pair(100, 1));
    v1.push_back(std::make_pair(100, 11));
    v1.push_back(std::make_pair(100, 21));
    v1.push_back(std::make_pair(101, 11));
    v1.push_back(std::make_pair(102, 11));
    v1.push_back(std::make_pair(102, 13));
    v1.push_back(std::make_pair(102, 14));
    v1.push_back(std::make_pair(103, 11));
    v1.push_back(std::make_pair(103, 13));

    v2.push_back(std::make_pair(10, 11));
    v2.push_back(std::make_pair(12, 10));
    v2.push_back(std::make_pair(12, 40));
    v2.push_back(std::make_pair(16, 11));
    v2.push_back(std::make_pair(100, 12)); // match
    v2.push_back(std::make_pair(101, 12)); // match
    v2.push_back(std::make_pair(101, 13));
    v2.push_back(std::make_pair(101, 14));
    v2.push_back(std::make_pair(102, 12)); //match
    v2.push_back(std::make_pair(103, 1));
    v2.push_back(std::make_pair(103, 10));
    v2.push_back(std::make_pair(103, 12)); // match
    v2.push_back(std::make_pair(103, 15));

    auto intersection = _intersect_two_words(v1, v2);
    for (auto entry : intersection)
    {
        std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl;
    }

    return 0;
}

#包括
#包括
#包括
#包括
typedef std:：字符串单词；
typedef unsigned int-WordPosition\u t；
typedef unsigned int-IDdocument\u t；
typedef std：：对文档位置\u t；
typedef std：：向量字引用\u t；
单词引用与两个单词相交（常量单词引用与v1、常量单词引用与v2）
{
//单词一个接一个出现的所有位置。
字交叉；
auto firstIt=v1.begin（）；
auto secondIt=v2.begin（）；
而（firstIt！=v1.end（）&&secondIt！=v2.end（））
{
if（firstIt->firstfirst）
{
++第一；
继续；
}
//在同一文档中找到第一个单词之后的第二个单词。
如果（secondIt->firstfirst | | secondIt->secondsecond+1）
{
++第二；
继续；
}
//第一个词不在第二个词之前，它不是一个短语。
如果（第二次->第二次->第一次->第二次+1）
{
++第一；
继续；
}
//我们找到了一个短语。
交叉口。向后布设（*firstIt）；
++第一；
++第二；
}
折返交叉口；
}
int main（）
{
文字参考文献v1，v2；
v1.推回（标准：：形成一对（10，5））；
v1.推回（标准：：形成一对（10，25））；
v1.推回（标准：：形成一对（11，10））；
v1.推回（标准：：形成一对（12，1））；
v1.推回（标准：：形成一对（12，11））；
v1.推回（标准：：形成一对（12，21））；
v1.推回（标准：：形成一对（12，31））；
v1.推回（标准：：形成一对（15，11））；
v1.推回（标准：：使_对（100，1））；
v1.推回（标准：：形成一对（100，11））；
v1.推回（标准：：形成一对（100，21））；
v1.推回（标准：：形成对（101，11））；
v1.推回（标准：：形成一对（102，11））；
v1.推回（标准：：形成一对（102，13））；
v1.推回（标准：：形成一对（102，14））；
v1.推回（标准：：形成一对（103，11））；
v1.推回（标准：：形成一对（103，13））；
v2.推回（标准：：形成一对（10，11））；
v2.推回（标准：：形成一对（12，10））；
v2.推回（标准：：形成一对（12，40））；
v2.推回（标准：：形成对（16，11））；
v2.向后推（std:：make_pair（100，12））；//匹配
v2.向后推（std:：make_pair（101，12））；//匹配
v2.推回（标准：：形成对（101，13））；
v2.推回（标准：：形成对（101，14））；
v2.向后推（std:：make_pair（102，12））；//匹配
v2.推回（标准：：形成对（103，1））；
v2.推回（标准：：使_对（103，10））；
v2.向后推（std:：make_pair（103，12））；//匹配
v2.推回（标准：：形成一对（103，15））；
while(i < SIZE_VECTOR_ONE  && j < SIZE_VECTOR_TWO)
{
    if (ID_doc_one < ID_doc_two)
    {
        ID_doc_one = v1[++i].first;

if (0 < 1) {
    ID_doc_one = v1[1].first;

while (oneIt != v1.end() && twoIt != v2.end()) {
    if (oneIt->first < twoIt->first) {
        ++oneIt;
        continue;
    } else if (*twoIt < *oneIt) {
        ++twoIt;
        continue;
    }
    // same documentId in both lists, snag positions.
    ...
}

    else {
    }   // To avoid "out of range" errors <-- but also ends the "else"
        if (i < SIZE_VECTOR_ONE - 1)
            ID_doc_one = v1[++i].first;
        if (j < SIZE_VECTOR_TWO - 1)
            ID_doc_two = v2[++j].first;
    }

    WordPosition_t pos_one = v1[i].second;
    WordPosition_t pos_two = v2[j].second;

    // The words make a phrase!  Return pos_two for the next intersection finding step
    if (pos_one + 1 == pos_two)

    WordPosition_t posFirstWord = v1[i].second;
    WordPosition_t posSecondWord = v2[j].second;

    // The words make a phrase!  Return pos_two for the next intersection finding step
    if (posSecondWord == posFirstWord + 1)

    if (pos_one + 1 == pos_two)
    {
        intersection.push_back(make_pair(ID_doc_one,pos_two));
        ID_doc_one = v1[++i].first;
        ID_doc_two = v2[++j].first;
    }

    else {
    }   // To avoid "out of range" errors
        if (i < SIZE_VECTOR_ONE - 1)
            ID_doc_one = v1[++i].first;
        if (j < SIZE_VECTOR_TWO - 1)
            ID_doc_two = v2[++j].first;
    }

#include<iostream>
#include<map>
#include<vector>
#include<string>

// Type aliases for a word-level inverted index.
using Word_t = std::string;
using WordPosition_t = unsigned int;
using IDdocument_t = unsigned int;

// One occurrence of a word: (document id, position inside that document).
using DocumentPosition_t = std::pair<IDdocument_t, WordPosition_t>;
// Every occurrence of a single word.
using WordReferences_t = std::vector<DocumentPosition_t>;

// Returns every occurrence in `v1` that is immediately followed, in the same
// document, by an occurrence in `v2` (i.e. v2.position == v1.position + 1).
//
// Both lists are walked front to back exactly once, so they are expected to be
// sorted ascending by (document id, position); unsorted input will simply miss
// matches.
//
// v1: occurrences of the first word of the phrase.
// v2: occurrences of the second word of the phrase.
// Returns the matching entries of `v1` (the position where the phrase starts).
WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2)
{
    WordReferences_t intersection;

    auto firstIt = v1.begin();
    auto secondIt = v2.begin();
    while (firstIt != v1.end() && secondIt != v2.end())
    {
        // Different documents: advance whichever list is lagging behind.
        if (firstIt->first < secondIt->first)
        {
            ++firstIt;
            continue;
        }
        if (secondIt->first < firstIt->first)
        {
            ++secondIt;
            continue;
        }

        // Same document from here on.
        // The second word sits at or before the first word, so it cannot
        // follow it. Written without "+ 1" so that a first-word position equal
        // to the maximum unsigned value does not wrap around to 0.
        if (secondIt->second <= firstIt->second)
        {
            ++secondIt;
            continue;
        }

        // The second word is strictly after the first one, hence its position
        // is >= 1 and the subtraction below cannot wrap. A gap larger than one
        // means this first-word occurrence does not start a phrase.
        if (secondIt->second - 1 > firstIt->second)
        {
            ++firstIt;
            continue;
        }

        // Adjacent positions in the same document: we found a phrase.
        intersection.emplace_back(*firstIt);
        ++firstIt;
        ++secondIt;
    }

    return intersection;
}

int main()
{
    WordReferences_t v1, v2;
    v1.push_back(std::make_pair(10, 5));
    v1.push_back(std::make_pair(10, 25));
    v1.push_back(std::make_pair(11, 10));
    v1.push_back(std::make_pair(12, 1));
    v1.push_back(std::make_pair(12, 11));
    v1.push_back(std::make_pair(12, 21));
    v1.push_back(std::make_pair(12, 31));
    v1.push_back(std::make_pair(15, 11));
    v1.push_back(std::make_pair(100, 1));
    v1.push_back(std::make_pair(100, 11));
    v1.push_back(std::make_pair(100, 21));
    v1.push_back(std::make_pair(101, 11));
    v1.push_back(std::make_pair(102, 11));
    v1.push_back(std::make_pair(102, 13));
    v1.push_back(std::make_pair(102, 14));
    v1.push_back(std::make_pair(103, 11));
    v1.push_back(std::make_pair(103, 13));

    v2.push_back(std::make_pair(10, 11));
    v2.push_back(std::make_pair(12, 10));
    v2.push_back(std::make_pair(12, 40));
    v2.push_back(std::make_pair(16, 11));
    v2.push_back(std::make_pair(100, 12)); // match
    v2.push_back(std::make_pair(101, 12)); // match
    v2.push_back(std::make_pair(101, 13));
    v2.push_back(std::make_pair(101, 14));
    v2.push_back(std::make_pair(102, 12)); //match
    v2.push_back(std::make_pair(103, 1));
    v2.push_back(std::make_pair(103, 10));
    v2.push_back(std::make_pair(103, 12)); // match
    v2.push_back(std::make_pair(103, 15));

    auto intersection = _intersect_two_words(v1, v2);
    for (auto entry : intersection)
    {
        std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl;
    }

    return 0;
}