2017-07-12 40 views
0

我有一個非常大的文件(55千兆字節的json數據)。我正在使用ifstream來讀取,而另一個ofstream要寫入另一個文件。該程序運行正確一段時間。然後由於內存使用量過大而崩潰。讀寫大文件C++(內存過載)

我試着用ignoreclear清除輸入緩衝區。 並嘗試使用flush清除輸出緩衝區。

此外,該文件是非常巨大的,所以,我希望它是快速的。

p.s.我半睡着時寫了json解析器。所以請原諒我糟糕的解析器代碼。也許內存泄漏出現在那裏。任何幫助將不勝感激。

小例子

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in); 
    std::ofstream outFile("processed.json", std::ofstream::out); 
    std::string str; 
    int a; 
    long long count = 0; 


    while (std::getline(file, str)) 
    { 

     JsonParserStateMachine jsonParserStateMachine; 
     for(char &c : str) jsonParserStateMachine.changeState(c); 
     //std::cout<<jsonParserStateMachine.getReview(); 
     //This lines just gives a string to the output which is around may be 1000 characters 
     outFile << jsonParserStateMachine.getReview(); 
     if(++count % 1000 == 0) { 
      std::cout<<count<<" Processed\n"; 
      outFile.flush(); 
      return 0; 
     } 
    } 
    outFile.close(); 
    return 0; 
} 

對於那些誰願意看到整個代碼

#include <fstream> 
#include <string> 
#include <iostream> 

enum state { 
    q0, q1, q2, q3, q4, q5, q6, h 
}; 

class KeyValueStore{ 
    std::string *keys; 
    std::string *values; 
    int currentKeyPosition; 
    int currentValuePosition; 
    int maxLength; 
public: 
    KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0) 
    { 
     this->keys = new std::string[length]; 
     this->values = new std::string[length]; 

     for(int i=0;i<length;i++) 
     { 
      this->keys[i] = ""; 
      this->values[i] = ""; 
     } 

    } 

    void updateKeyPosition() 
    { 
     this->currentKeyPosition = this->currentKeyPosition++%9; 
    } 
    void updateValuePosition() 
    { 
     this->currentValuePosition = this->currentValuePosition++%9; 
    } 

    void putKey(char c) 
    { 
     this->keys[currentKeyPosition] += c; 
    } 
    void putValue(char c) 
    { 
     this->values[currentValuePosition] += c; 
    } 


    std::string getValue(std::string key) 
    { 
     for(int i=0;i<this->maxLength;i++) 
     { 
      if(this->keys[i] == key) return this->values[i]; 
     } 
     return ""; 
    } 

    void print() 
    { 
     std::cout<<"Keys"<<"\t"<<"Values"<<std::endl; 
     for(int i=0;i<maxLength;i++) 
     { 
      std::cout<<this->keys[i] <<'\t'<<this->values[i]<<std::endl; 
     } 
    } 


    std::string getReview() 
    { 
     return std::string("{\"" + this->getValue("reviewText") + "\":\"" + this->getValue("overall") + "\"}"); 
    } 
}; 



class JsonParserStateMachine{ 
    state currentState; 
    KeyValueStore keyValueStore; 
    bool inNum; 
    bool inArray; 
public: 
    JsonParserStateMachine(): keyValueStore(9), currentState(state::q0), inNum(false),inArray(false){} 

    state getState() 
    { 
     return this->currentState; 
    } 

    void print() 
    { 
     keyValueStore.print(); 
    } 


    std::string getReview() 
    { 
     return keyValueStore.getReview(); 
    } 

    state changeState(char c) 
    { 
     switch(currentState) 
     { 
      case state::q0: 
       if(c == ' ') break; 
       else if(c == '{') this->currentState = state::q1; 
       else this->currentState = state::h; 
       break; 
      case state::q1: 
       if(c == ' ') break; 
       else if(c == '\"') this->currentState = state::q2; 
       else this->currentState = state::h; 
       break; 
      case state::q2: 
       if(c == '\"'){ 
        this->currentState = state::q3; 
        this->keyValueStore.updateKeyPosition(); 
        break; 
       } 
       else{ 
        this->keyValueStore.putKey(c); 
        break; 
       } 
      case state::q3: 
       if(c == ':') this->currentState = state::q4; 
       else if(c == ' ') { 

       } 
       else { 
        //std::cout<<"From Q3"<<std::endl; 
        this->currentState = state::h; 
       }break; 
      case state::q4: 
       if(c == '\"' || c == '[') { 
        this->currentState = state::q5; 
        inArray = c == '[' ? true: false; 

       }else if(c == ' ') break; 
       else { 
        //std::cout<<"Got Here"<<std::endl; 
        inNum = true; 
        this->currentState = state::q5; 
        this->keyValueStore.putValue(c); 
       } 
       break; 
      case state::q5: 
       if(c == '\"' || c == ']'){ 
        this->currentState = state::q6; 
        this->keyValueStore.updateValuePosition(); 
        inArray = c == ']'? false: true; 
        break; 
       }else if(inNum && c == ','){ 
        this->currentState = state::q1; 
        this->keyValueStore.updateValuePosition(); 
        inNum = false; 
       } 
       else{ 
        this->keyValueStore.putValue(c); 
        break; 
       } 
      case state::q6: 
       if(c == ','){ 
        this->currentState = state::q1; 
        break; 
       }else if(c == ' '){ 
        break; 
       }else{ 
        //std::cout<<"From Q6"<<std::endl; 
        this->currentState = state::h; 
       } 
     } 

     return this->currentState; 
    } 
}; 

class Review{ 

    std::string reviewText; 
    int overall; 
    std::string summary; 
public: 
    void pusReviewText(std::string reviewText) 
    { 
     this->reviewText = reviewText; 
    } 

    void putOverall(int overall) 
    { 
     this->overall = overall; 
    } 


    void putSummary(std::string summary) 
    { 
     this->summary = summary; 
    } 

    std::string getReviewText() 
    { 
     return this->reviewText; 
    } 
    int getOverall() 
    { 
     return this->overall; 
    } 
    std::string getSummary() 
    { 
     return this->summary; 
    } 
}; 

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in); 
    std::ofstream outFile("processed.json", std::ofstream::out); 
    std::string str; 
    int a; 
    long long count = 0; 


    while (std::getline(file, str)) 
    { 

     JsonParserStateMachine jsonParserStateMachine; 
     for(char &c : str) jsonParserStateMachine.changeState(c); 
     //std::cout<<jsonParserStateMachine.getReview(); 
     outFile << jsonParserStateMachine.getReview(); 
     if(++count % 1000 == 0) { 
      std::cout<<count<<" Processed\n"; 
      outFile.flush(); 
      return 0; 
     } 
    } 
    outFile.close(); 
    return 0; 
} 
+0

我還搜索了stackoverflow,所有問題都建議刷新,並忽略 –

+0

您的'KeyValueStore'需要一個析構函數。 – Frank

+0

@Frank讓我現在試試.. –

回答

2

的問題來自於您的KeyValueStore類:

KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0) 
{ 
    this->keys = new std::string[length]; 
    this->values = new std::string[length]; 
    ... 

什麼事也沒刪除這些陣列。在析構函數中刪除它們是簡單的修復:

~KeyValueStore() { 
    delete[] this->keys; 
    delete[] this->values; 
} 

但是!你真的應該考慮使用std::vector<std::string>來代替。或者甚至更好,重建整個東西圍繞std::unordered_map<std::string, std::string> instead.

+0

我不打算使用STL –

+0

謝謝。有效。我是這樣一個白癡 –

+1

那麼你爲什麼使用fstreams? STL的很多部分可能會有問題,但如果fstreams是公平的遊戲,則不可能有合理的理由避免使用std :: vector。哎呀,std :: string很容易成爲令人反感的部分,並且你正在使用它們很多。 – Frank