倒排索引：在一組文檔

我執行的倒排索引結構，特別是一個允許布爾查詢和字級粒度查找短語。倒排索引：在一組文檔

我有一個大型的文本數據庫，並且我保留一個索引，告訴我每個單詞，它是哪個文件（IDdoc），以及它在哪個文件中（position）。（一個字可以在許多文件，並在一個文件中的許多地方。）

因此，我保持一個向量每個字：

vector<pair<IDdoc,position>> occurences_of_word;

（矢量由IDdoc，然後按位置排序，升序）。

我有一個string物體由字。這是短語我在找。

對於在短語每個字我想知道哪些文件包含這個詞組，因此返回的IDdoc秒的載體。

這是我在一個解決方案的嘗試：

typedef std::string  Word_t; 
typedef unsigned int WordPosition_t; 
typedef unsigned int IDdocument_t; 

vector<pair<IDdocument_t,WordPosition_t> > IndiceInvertidoBooleanoConPosicion::_interseccion_dos_listas 
    (const vector<pair<IDdocument_t,WordPosition_t>> & v1, 
    const vector<pair<IDdocument_t,WordPosition_t>> & v2) 
{ 
vector<pair<IDdocument_t,WordPosition_t> > intersection; 

IDdocument_t ID_doc_one, ID_doc_two; 

int i = 0; 
int j = 0; 
const int MAX_INDEX_V1 = v1.size() -1; 
const int MAX_INDEX_V2 = v2.size() -1; 

while(i <= MAX_INDEX_V1 && j <= MAX_INDEX_V2) 
{ 
    ID_doc_one = v1[i].first; 
    ID_doc_two = v2[j].first; 
    if (ID_doc_one < ID_doc_two) 
     i++; 
    else if (ID_doc_one > ID_doc_two) 
     j++; 
    else // The words were found in the same document! 
    { 
     WordPosition_t pos_word_one = v1[i].second; 
     WordPosition_t pos_word_two = v2[j].second; 

     // The words make a phrase! Return pos_two for the next intersection finding step 
     if (pos_word_one + 1 == pos_word_two) 
     { 
      intersection.push_back(make_pair(ID_doc_one,pos_word_two)); 
      i++; 
      j++; 
     } 

     // Phrase not found 
     else 
     { 
      if (pos_word_one < pos_word_two) 
       i++; 
      else 
       j++; 
     } 

    } 
} 

return intersection; 
} 

int find_phrase(const string phrase, vector<IDdocument_t> & id_docs) 
{ 
Word_t word; 
id_docs.clear(); 
Text parsed_phrase; 
// Extract the relevant words from the phrase 
parsed_phrase.parse(phrase); 

vector<pair<IDdocument_t,WordPosition_t> > intersection; 
vector<pair<IDdocument_t,WordPosition_t> > second_vector; 

while (parsed_phrase.get_next_word(word) != RES_END) 
{ 
    _find_vector_words(word,intersection); 

    while (parsed_phrase.get_next_word(word) != RES_END) 
    { 
     _find_vector_words(word,second_vector); 

     intersection = _intersect_two_words(intersection,second_vector); 

    } 
} 

for (unsigned int i = 0; i < intersection.size(); i ++) 
{ 
    IDdocument_t id_doc = intersection[i].first; 
    if(std::find(id_docs.begin(), id_docs.end(), id_doc) == id_docs.end()) 
     id_docs.push_back(id_doc); 
} 

return RES_OK; 
}

來源

2013-06-27 Maria Ines Parnisari

不知道你在問究竟 - 你問如何確定哪些文件包含「A頭號飛利浦螺絲刀「，或者哪些文件包含單詞」A「，」編號「，」一個「，」philips「或」螺絲刀「。如果前者，他們是否必須是連續的或將「一把螺絲刀的手柄數量是一個飛利浦和pozidrive」是一個匹配？ –

@MatsPetersson，他們需要是連續的。 –

相關：http://stackoverflow.com/questions/2659120/how-to-search-phrase-queries-in-inverted-index-structure – jogojapan

對於從字符串表示查找一個特定的單詞，你可能想看看像map。爲了創建一個簡單的結果聯合，你可能需要set。這個實現更像是一個演示而不是一個非常理想的最終實現（c.f.）。草率的短語解析）。

#include <vector> 
#include <map> 
#include <set> 
#include <iostream> 
#include <string> 

typedef std::string IDdoc; 
typedef int position; 

typedef std::pair<IDdoc,position> Occurrence; 
typedef std::vector<Occurrence> OccurrencesOfWord; 
typedef std::map<std::string /*word*/, OccurrencesOfWord> Dictionary; 
typedef std::set<IDdoc> Matches; 

bool findMatchesForPhrase(const std::string& phrase, const Dictionary& dictionary, Matches& matches) 
{ 
    size_t pos = 0; 
    size_t len = 0; 
    while (pos < phrase.length()) { 
     size_t end = phrase.find(' ', pos); 
     size_t len = ((end == phrase.npos) ? phrase.length() : end) - pos; 
     std::string word(phrase, pos, len); 
     pos += len + 1; // to skip the space. 

     // ignore words not in the dictionary. 
     auto dictIt = dictionary.find(word); 
     if (dictIt == dictionary.end()) 
      continue; 

     auto& occurrences = dictIt->second; // shortcut/alias,. 
     for (auto& occurIt : occurrences) { 
      // Add all the IDdoc's of this occurence to the set. 
      matches.insert(occurIt.first); 
     } 
    } 

    return !matches.empty(); 
} 

void addToDictionary(Dictionary& dict, const char* word, const char* doc, int position) 
{ 
    dict[word].push_back(std::make_pair(std::string(doc), position)); 
} 

int main(int argc, const char** argv) 
{ 
    std::string phrase("pizza is life"); 
    Dictionary dict; 

    addToDictionary(dict, "pizza", "book1", 10); 
    addToDictionary(dict, "pizza", "book2", 30); 
    addToDictionary(dict, "life", "book1", 1); 
    addToDictionary(dict, "life", "book3", 1); 
    addToDictionary(dict, "goat", "book4", 99); 

    Matches matches; 
    bool result = findMatchesForPhrase(phrase, dict, matches); 

    std::cout << "result = " << result << std::endl; 
    for (auto& ent : matches) { 
     std::cout << ent << std::endl; 
    } 

    return 0; 
}

在這個在線演示：http://ideone.com/Zlhfua

跟進，以解決您的更改：

while(i < SIZE_VECTOR_ONE && j < SIZE_VECTOR_TWO) 
{ 
    if (ID_doc_one < ID_doc_two) 
    { 
     ID_doc_one = v1[++i].first;

比方說「SIZE_VECTOR 1」是1，這意味着，有一個元素在向量中，元素[0]。如果ID_doc_one是0並且ID_doc_two是1，則

if (0 < 1) { 
    ID_doc_one = v1[1].first;

這是無效的。你可能會關閉使用迭代器或指針更好：

while (oneIt != v1.end() && twoIt != v2.end()) { 
    if (oneIt->first < twoIt->first) { 
     ++oneIt; 
     continue; 
    } else if (*twoIt < *oneIt) { 
     ++twoIt; 
     continue; 
    } 
    // same documentId in both lists, snag positions. 
    ... 
}

下，這看起來有點破：

else { 
    } // To avoid "out of range" errors <-- but also ends the "else" 
     if (i < SIZE_VECTOR_ONE - 1) 
      ID_doc_one = v1[++i].first; 
     if (j < SIZE_VECTOR_TWO - 1) 
      ID_doc_two = v2[++j].first; 
    }

我不知道，如果你有相同的文檔，但在多個位置會發生什麼？

接下來的這位是挑剔的，但我花了很長的時間來解析

WordPosition_t pos_one = v1[i].second; 
    WordPosition_t pos_two = v2[j].second; 

    // The words make a phrase! Return pos_two for the next intersection finding step 
    if (pos_one + 1 == pos_two)

似乎大大清晰的寫本，你可能會說「（如果第二個字是在後的位置第一個字）：

WordPosition_t posFirstWord = v1[i].second; 
    WordPosition_t posSecondWord = v2[j].second; 

    // The words make a phrase! Return pos_two for the next intersection finding step 
    if (posSecondWord == posFirstWord + 1)

接下來的這個部分是一種令人困惑的，因爲這兩個條款似乎是爲了增加i和j和更新ID_doc_one和二，它會是有意義的那部分吊到一個共同的在if塊之後的部分，但是再次使用else {}很難說你實際上在做什麼。

if (pos_one + 1 == pos_two) 
    { 
     intersection.push_back(make_pair(ID_doc_one,pos_two)); 
     ID_doc_one = v1[++i].first; 
     ID_doc_two = v2[++j].first; 
    } 

    else { 
    } // To avoid "out of range" errors 
     if (i < SIZE_VECTOR_ONE - 1) 
      ID_doc_one = v1[++i].first; 
     if (j < SIZE_VECTOR_TWO - 1) 
      ID_doc_two = v2[++j].first; 
    }

當你匹配兩個數組，你總是希望增加雙方i和j，這不是調理，我也不知道爲什麼你正在使用pos_two，因爲這句話在pos_one居然發現？

這是我怎麼會寫它：

#include<iostream> 
#include<map> 
#include<vector> 
#include<string> 

typedef std::string   Word_t; 
typedef unsigned int  WordPosition_t; 
typedef unsigned int  IDdocument_t; 

typedef std::pair<IDdocument_t, WordPosition_t> DocumentPosition_t; 
typedef std::vector<DocumentPosition_t> WordReferences_t; 

WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2) 
{ 
    // all the locations where the words occur one after the other. 
    WordReferences_t intersection; 

    auto firstIt = v1.begin(); 
    auto secondIt = v2.begin(); 
    while (firstIt != v1.end() && secondIt != v2.end()) 
    { 
     if (firstIt->first < secondIt->first) 
     { 
      ++firstIt; 
      continue; 
     } 
     // find the second word in the same document and AFTER the first word. 
     if (secondIt->first < firstIt->first || secondIt->second < firstIt->second + 1) 
     { 
      ++secondIt; 
      continue; 
     } 
     // first word wasn't just before the second, it's not a phrase. 
     if (secondIt->second > firstIt->second + 1) 
     { 
      ++firstIt; 
      continue; 
     } 
     // We found a phrase. 
     intersection.emplace_back(*firstIt); 
     ++firstIt; 
     ++secondIt; 
    } 

    return intersection; 
} 

int main() 
{ 
    WordReferences_t v1, v2; 
    v1.push_back(std::make_pair(10, 5)); 
    v1.push_back(std::make_pair(10, 25)); 
    v1.push_back(std::make_pair(11, 10)); 
    v1.push_back(std::make_pair(12, 1)); 
    v1.push_back(std::make_pair(12, 11)); 
    v1.push_back(std::make_pair(12, 21)); 
    v1.push_back(std::make_pair(12, 31)); 
    v1.push_back(std::make_pair(15, 11)); 
    v1.push_back(std::make_pair(100, 1)); 
    v1.push_back(std::make_pair(100, 11)); 
    v1.push_back(std::make_pair(100, 21)); 
    v1.push_back(std::make_pair(101, 11)); 
    v1.push_back(std::make_pair(102, 11)); 
    v1.push_back(std::make_pair(102, 13)); 
    v1.push_back(std::make_pair(102, 14)); 
    v1.push_back(std::make_pair(103, 11)); 
    v1.push_back(std::make_pair(103, 13)); 

    v2.push_back(std::make_pair(10, 11)); 
    v2.push_back(std::make_pair(12, 10)); 
    v2.push_back(std::make_pair(12, 40)); 
    v2.push_back(std::make_pair(16, 11)); 
    v2.push_back(std::make_pair(100, 12)); // match 
    v2.push_back(std::make_pair(101, 12)); // match 
    v2.push_back(std::make_pair(101, 13)); 
    v2.push_back(std::make_pair(101, 14)); 
    v2.push_back(std::make_pair(102, 12)); //match 
    v2.push_back(std::make_pair(103, 1)); 
    v2.push_back(std::make_pair(103, 10)); 
    v2.push_back(std::make_pair(103, 12)); // match 
    v2.push_back(std::make_pair(103, 15)); 

    auto intersection = _intersect_two_words(v1, v2); 
    for (auto entry : intersection) 
    { 
     std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl; 
    } 

    return 0; 
}

活生生的例子：http://ideone.com/XRfhAI

來源

2013-06-28 02:54:00 kfsone

嘿，你介意看看我原來的帖子嗎？我發佈了我的解決方案。謝謝！ –

看到我的修改回覆。 – kfsone

謝謝@kfsone！我用我的新版代碼更新了我的帖子。 –