使用stringstream標記一個具有不同的分隔符的字符串

如何使用stringstream把下面這種格式的一行文字拆分成標記（token）？

[標籤] 操作碼 [arg1][,arg2]

標籤不一定總是存在；如果沒有標籤，該位置會是空白字符。操作碼總是存在，操作碼和arg1之間有空格或製表符。arg1和arg2之間沒有空格，而是以逗號分隔。

此外，有些空行上只包含空白字符，這些行需要被丟棄。以「#」開頭的行是註釋。

因此，例如：

#Sample Input 
TOP NoP 
    L 2,1 
VAL INT 0

這只是我將要讀取的文本文件的一個例子。在第一行中，標籤是TOP，操作碼是NOP，沒有傳遞任何參數。

我一直在努力，但我需要一個更簡單的方式來進行標記化。就我所見，stringstream似乎正是我想要使用的類，所以如果有人能告訴我如何用它來做到這一點，我會非常感激。

我已經絞盡腦汁思考該怎麼做了。爲了表明我並不是沒有付出努力就來求答案，下面是我當前的代碼：

// First loop over the input file: walks every line, records each label in
// Symtablelab/Symtablepos together with the value of `counter` at that point,
// and advances `counter` according to the opcode found on the line.
int counter = 0; // running address: +3 per ordinary opcode, +1 per INT, +N for "WORDS N"
int i = 0; // next free slot in Symtablelab / Symtablepos
int j = 0; // not used anywhere in the code shown here
int p = 0; // write index into memory[]; only used by the second loop

while (getline(myFile, line, '\n')) 
{ 


    // Comment line: skip it.
    // NOTE(review): this reads line[0] before the emptiness checks below.
    // Presumably `line` is a std::string, for which index 0 of an empty
    // string yields '\0' (C++11 and later) - confirm.
    if (line[0] == '#') 
    { 
     continue; 
    } 

    // Skip empty lines.
    if (line.length() == 0) 
    { 
     continue; 
    } 

    // Redundant with the length() check directly above.
    if (line.empty()) 
    { 
     continue; 
    } 

    // If the first letter isn't a tab or space then it's a label 

    if (line[0] != '\t' && line[0] != ' ') 
    { 

     string delimeters = "\t "; 

     // `current` is the start of the token being extracted and `next` is the
     // position of the delimiter that ends it. find_first_of returns
     // string::npos when no delimiter is left; the `next > 0` tests below
     // rely on that value being negative once stored in an int.
     int current; 
     int next = -1; 


     // First token on the line is the label.
     current = next + 1; 
     next = line.find_first_of(delimeters, current); 
     label = line.substr(current, next - current); 

     // Remember the label and the address it refers to.
     Symtablelab[i] = label; 
     Symtablepos[i] = counter; 

     if(next>0) 
     { 
      // Second token is the opcode.
      // NOTE(review): tokens are split on every single delimiter character,
      // so two consecutive blanks produce an empty token here.
      current = next + 1; 
      next = line.find_first_of(delimeters, current); 
      opcode = line.substr(current, next - current); 


      // Every opcode other than WORDS and INT advances the address by 3.
      if (opcode != "WORDS" && opcode != "INT") 
      { 
       counter += 3; 
      } 

      // INT advances the address by 1.
      if (opcode == "INT") 
      { 
       counter++; 
      } 

      // A delimiter followed the opcode, so try to read the first argument,
      // which ends at a comma or whitespace.
      if (next > 0) 
      { 
       delimeters = ", \n\t"; 
       current = next + 1; 
       next = line.find_first_of(delimeters, current); 
       arg1 = line.substr(current, next-current); 

       // WORDS advances the address by the numeric value of its argument.
       if (opcode == "WORDS") 
       { 
        counter += atoi(arg1.c_str()); 
       } 
      } 

      // A delimiter followed arg1, so the rest of the line is the second argument.
      // NOTE(review): when this branch is not taken, arg2 keeps whatever value
      // an earlier line left in it (same for arg1 above).
      if (next > 0) 
      { 
       delimeters ="\n"; 
       current = next +1; 
       next = line.find_first_of(delimeters,current); 
       arg2 = line.substr(current, next-current); 

      } 
     } 

     // Move to the next symbol-table slot.
     i++; 

    } 

    // If the first character is a tab or space then there is no label and we just need to get a counter 
    if (line[0] == '\t' || line[0] == ' ') 
    { 
     string delimeters = "\t \n"; 
     int current; 
     int next = -1; 
     // line[0] is itself a delimiter, so `next` becomes 0 and `label` is empty.
     current = next + 1; 
     next = line.find_first_of(delimeters, current); 
     label = line.substr(current, next - current); 

    if(next>=0) 
     { 
      // Token that follows the single leading blank.
      current = next + 1; 
      next = line.find_first_of(delimeters, current); 
      opcode = line.substr(current, next - current); 

      // NOTE(review): opcode is the text between two delimiters, so it cannot
      // equal a lone delimiter character; this test looks unreachable. An
      // empty opcode (e.g. a line starting with two blanks) is NOT filtered
      // here and falls through to `counter += 3`, whereas the second loop
      // skips such lines - confirm whether that mismatch is intended.
      if (opcode == "\t" || opcode =="\n"|| opcode ==" ") 
      { 
       continue; 
      } 

      // Every opcode other than WORDS and INT advances the address by 3.
      if (opcode != "WORDS" && opcode != "INT") 
      { 
       counter += 3; 
      } 

      // INT advances the address by 1.
      if (opcode == "INT") 
      { 
       counter++; 
      } 


      // First argument, terminated by a comma or whitespace.
      if (next > 0) 
      { 
       delimeters = ", \n\t"; 
       current = next + 1; 
       next = line.find_first_of(delimeters, current); 
       arg1 = line.substr(current, next-current); 

       // WORDS advances the address by the numeric value of its argument.
       if (opcode == "WORDS") 
       { 
        counter += atoi(arg1.c_str()); 
       } 

      } 



      // Second argument, terminated by whitespace or end of line.
      if (next > 0) 
      { 
       delimeters ="\n\t "; 
       current = next +1; 
       next = line.find_first_of(delimeters,current); 
       arg2 = line.substr(current, next-current); 

      } 
     } 

    } 
} 

// Second loop over the same file: clear the stream state left by the first
// loop, rewind to the beginning, then tokenize each line again and store
// opcodes and operands into memory[] starting at index p.
myFile.clear(); 
myFile.seekg(0, ios::beg); 

while(getline(myFile, line)) 
{ 
    // Skip empty lines.
    if (line.empty()) 
    { 
     continue; 
    } 

    // Skip comment lines.
    if (line[0] == '#') 
    { 
     continue; 
    } 

    // Redundant with the empty() check above.
    if (line.length() == 0) 
    { 
     continue; 
    } 



    // If the first letter isn't a tab or space then it's a label 

    if (line[0] != '\t' && line[0] != ' ') 
    { 

     string delimeters = "\t "; 

     // `current` is the start of the token being extracted and `next` is the
     // position of the delimiter that ends it. find_first_of returns
     // string::npos when no delimiter is left; the `next > 0` tests below
     // rely on that value being negative once stored in an int.
     int current; 
     int next = -1; 


     // First token is the label; it is parsed but not stored in this loop.
     current = next + 1; 
     next = line.find_first_of(delimeters, current); 
     label = line.substr(current, next - current); 


     if(next>0) 
     { 
      // Second token is the opcode.
      current = next + 1; 
      next = line.find_first_of(delimeters, current); 
      opcode = line.substr(current, next - current); 



      // First argument, terminated by a comma or whitespace.
      if (next > 0) 
      { 
       delimeters = ", \n\t"; 
       current = next + 1; 
       next = line.find_first_of(delimeters, current); 
       arg1 = line.substr(current, next-current); 

      } 

      // Second argument, terminated by whitespace or end of line.
      if (next > 0) 
      { 
       delimeters ="\n\t "; 
       current = next +1; 
       next = line.find_first_of(delimeters,current); 
       arg2 = line.substr(current, next-current); 

      } 
     } 

     // NOTE(review): opcode, arg1 and arg2 are not reset per line, so when a
     // branch above is skipped the values from an earlier line are used below.

     // INT stores only its argument and occupies one cell.
     if (opcode == "INT") 
     { 
      memory[p] = arg1; 
      p++; 
      continue; 
     } 

     // Opcodes stored without operands; p still advances by three cells.
     if (opcode == "HALT" || opcode == "NOP" || opcode == "P_REGS") 
     { 
      memory[p] = opcode; 
      p+=3; 
      continue; 
     } 

     // Jump-style opcodes: store the opcode and one operand, advance by three cells.
     // NOTE(review): "JEQ" is not in this list although every other opcode
     // here has both a plain and an "R" form - possibly an omission.
     if(opcode == "J" || opcode =="JEQR" || opcode == "JNE" || opcode == "JNER" || opcode == "JLT" || opcode == "JLTR" || opcode == "JGT" || opcode == "JGTR" || opcode == "JLE" || opcode == "JLER" || opcode == "JGE" || opcode == "JGER" || opcode == "JR") 
     { 
      memory[p] = opcode; 
      memory[p+1] = arg1; 
      p+=3; 
      continue; 
     } 

     // WORDS n: fill cells with "0" and advance p by n.
     // NOTE(review): the loop condition `k <= l` writes l + 1 cells while p
     // advances by only l; the first loop also reserves only l. The extra
     // cell is memory[p + l] - confirm it cannot run past the end of memory.
     if (opcode == "WORDS") 
     { 
      int l = atoi(arg1.c_str()); 
      for (int k = 0; k <= l; k++) 
      { 
       memory[p+k] = "0"; 
      } 

      p+=l; 
      continue; 
     } 

     // Any other opcode: store the opcode and both operands in three cells.
     // This `else` pairs with the WORDS test directly above.
     else 
     { 
      memory[p] = opcode; 
      memory[p+1] = arg1; 
      memory[p+2] = arg2; 
      p+=3; 
     } 

    } 

    // If the first character is a tab or space then there is no label and we just need to get a counter   


    if (line[0] == '\t' || line[0] == ' ') 
    { 
     string delimeters = "\t "; 
     int current; 
     int next = -1; 
     // line[0] is itself a delimiter, so `next` becomes 0 and `label` is empty.
     current = next + 1; 
     next = line.find_first_of(delimeters, current); 
     label = line.substr(current, next - current); 

    if(next>=0) 
     { 
      // Token that follows the single leading blank.
      current = next + 1; 
      next = line.find_first_of(delimeters, current); 
      opcode = line.substr(current, next - current); 

      // Nothing usable after the leading blank: skip the line. The empty
      // string case is what catches lines that start with two or more blanks.
      if (opcode == "\t" || opcode =="\n"|| opcode ==" "|| opcode == "") 
      { 
       continue; 
      } 



      // First argument, terminated by a comma or whitespace.
      if (next > 0) 
      { 
       delimeters = ", \n\t"; 
       current = next + 1; 
       next = line.find_first_of(delimeters, current); 
       arg1 = line.substr(current, next-current); 

      } 



      // Second argument, terminated by whitespace or end of line.
      if (next > 0) 
      { 
       delimeters ="\n\t "; 
       current = next +1; 
       next = line.find_first_of(delimeters,current); 
       arg2 = line.substr(current, next-current); 

      } 
     } 

     // The stores below duplicate the labelled branch above.

     // INT stores only its argument and occupies one cell.
     if (opcode == "INT") 
     { 
      memory[p] = arg1; 
      p++; 
      continue; 
     } 

     // Opcodes stored without operands; p still advances by three cells.
     if (opcode == "HALT" || opcode == "NOP" || opcode == "P_REGS") 
     { 
      memory[p] = opcode; 
      p+=3; 
      continue; 
     } 

     // Jump-style opcodes: store the opcode and one operand, advance by three cells.
     if(opcode == "J" || opcode =="JEQR" || opcode == "JNE" || opcode == "JNER" || opcode == "JLT" || opcode == "JLTR" || opcode == "JGT" || opcode == "JGTR" || opcode == "JLE" || opcode == "JLER" || opcode == "JGE" || opcode == "JGER" || opcode == "JR") 
     { 
      memory[p] = opcode; 
      memory[p+1] = arg1; 
      p+=3; 
      continue; 
     } 

     // WORDS n: fill cells with "0" and advance p by n.
     // NOTE(review): `k <= l` writes l + 1 cells while p advances by only l.
     if (opcode == "WORDS") 
     { 
      int l = atoi(arg1.c_str()); 
      for (int k = 0; k <= l; k++) 
      { 
       memory[p+k] = "0"; 
      } 

      p+=l; 

      continue; 
     } 

     // Any other opcode: store the opcode and both operands in three cells.
     else 
     { 
      memory[p] = opcode; 
      memory[p+1] = arg1; 
      memory[p+2] = arg2; 
      p+=3; 
     } 
    } 
}

我顯然希望使這要好得多，所以任何幫助將不勝感激。

來源

2012-09-18 cadavid4j

如果stringstream真的沒有執行，那麼我會建議你使用這個答案的參考 http://stackoverflow.com/a/53863/1410711 – Recker

鑑於這種複雜的輸入，你幾乎肯定想要開始思考就詞法分析器和可能的解析器而言。一些可能性包括Flex/byacc或Boost Spirit/Qi（儘管肯定會有更多）。 –

我可以使用字符串流來完成此任務嗎？ Boost tokenizer是我現在無法使用的東西。 – cadavid4j

在你爲了維護這些巨大的if語句、或者爲了學習Boost Spirit而發瘋之前，讓我們先試着編寫一個非常簡單的解析器。這是一篇很長的帖子，而且不會直奔主題，所以請耐心看完。

首先，我們需要一個語法，它看起來非常簡單：

line 
      label(optional) opcode argument-list(optional) 

    argument-list 
      argument 
      argument, argument-list

用白話說：一行代碼包含一個可選的標籤、一個操作碼和一個可選的參數列表。參數列表可以是單個參數（整數），也可以是一個參數後跟分隔符（逗號）和另一個參數列表。

我們首先定義兩個數據結構。標籤應該是唯一的（對吧？），所以我們將有一組字符串，以便我們可以隨時查看它們，並且如果我們找到重複的標籤，可能會報告錯誤。下一個是到size_t的字符串映射，它作爲有效操作碼的符號表以及每個操作碼的參數的預期數量。

// Labels accepted so far; a set so that a repeated label can be detected.
std::set<std::string> labels;

// Known opcodes, each mapped to the number of arguments it expects.
std::map<std::string, size_t> symbol_table{
    {"INT", 1},
    {"L", 2},
    {"NOP", 0},
};

我不清楚你代碼中的memory究竟是什麼，但你計算偏移量來決定參數存放位置的方式似乎過於複雜（沒有必要）。我們來定義一個可以優雅地保存一行代碼的數據結構。我會這樣做：

// Integer arguments belonging to one line of code.
using arg_list = std::vector<int>;

// One parsed line of source: optional label, opcode and its arguments.
struct code_line {
    std::string label;   // empty when the line carries no label
    std::string opcode;  // the instruction name
    arg_list args;       // may be empty; the count is validated against
                         // the opcode elsewhere

    // Leaves all three members default-constructed (i.e. empty).
    code_line() {}
};

語法錯誤是一種異常情況，不容易從中恢復，所以我們通過拋出異常來處理它們。我們的簡單異常類可以像下面這樣：

// Exception object thrown when a line is malformed; it carries only a message.
struct syntax_error {
    std::string msg;  // human-readable description of the problem

    // Stores a copy of the supplied message.
    syntax_error(std::string m) : msg{m} {}
};

標記化（tokenizing）、詞法分析（lexing）和語法解析（parsing）通常是彼此分開的任務。但我想對於這個簡單的例子，我們可以把標記化和詞法分析合併到一個類中。我們已經知道語法由哪些元素組成，因此讓我們編寫一個類，它以文本作爲輸入，並從中提取語法元素。接口看起來是這樣的：

// Pulls the grammar elements (label, opcode, argument list) out of one line
// of text. Only the interface is declared here; the member functions are
// defined separately.
class token_stream {
public:
    // Wraps a copy of `str` in the input stream and starts with an empty buffer.
    token_stream(std::string str) : stream(str), buffer() { }

    // Extract the label of the line.
    std::string get_label();

    // Extract the opcode of the line.
    std::string get_opcode();

    // Extract the argument list of the line.
    arg_list get_arglist();

private:
    std::istringstream stream;  // source of tokens for this line
    std::string buffer;         // holds a token for later use
};

接下來是承擔主要工作的函數：它嘗試理解這些標記的含義，如果一切順利，就返回一個code_line結構：

// Parses one line of source text into a code_line.
//
// line: the text of a single non-empty, non-comment line.
// Returns the label (possibly empty), opcode and arguments found on the line.
// Throws syntax_error if the opcode is not in symbol_table, if the number of
// arguments does not match the opcode, or if token extraction itself throws.
// On success a non-empty label is added to `labels`.
code_line parse(std::string line) 
{ 
    code_line temp; 
    token_stream stream(line); 

    // Pull the three grammar elements out of the token stream, in order.
    temp.label = stream.get_label(); 
    temp.opcode = stream.get_opcode(); 
    temp.args = stream.get_arglist(); 

    // Look the opcode up with find() rather than operator[]: operator[] would
    // silently insert an unknown opcode with an expected argument count of 0,
    // which both accepts invalid input and makes later lines treat that
    // string as an opcode instead of a label.
    const auto entry = symbol_table.find(temp.opcode); 
    if (entry == symbol_table.end()) { 
     throw syntax_error("Unknown opcode."); 
    } 

    // Check that we got the correct number of arguments for the given opcode.
    if (entry->second != temp.args.size()) { 
     throw syntax_error("Wrong number of parameters."); 
    } 

    // Only now register the label. Doing it earlier (inside get_label) would
    // leave a "dangling" label in the table if the rest of the line turned
    // out to be syntactically invalid and an exception was thrown.
    if (!temp.label.empty()) labels.insert(temp.label); 

    return temp; 
}

下面是我們怎樣才能使用這一切：

int main() 
{ 
    std::string line; 
    std::vector<code_line> code; 

    while (std::getline(std::cin, line)) { 

     // empty line or a comment, ignore it 
     if (line.empty() || line[0] = '#') continue; 

     try { 
      code.push_back(parse(line)); 
     } catch (syntax_error& e) { 
      std::cout << e.msg << '\n'; 

      // Give up, try again, log... up to you. 
     } 
    } 
}

如果輸入被成功解析，我們現在就得到了一個vector，其中保存着所有有效的行以及它們的全部信息（標籤、參數個數等），可以用它做幾乎任何我們想做的事。在我看來（IMO），這段代碼比你原來的代碼更容易維護和擴展。例如，如果您需要引入新的操作碼，則只需在map（symbol_table）中再添加一個條目即可。與你的if語句相比，這怎麼樣？ :)

唯一剩下的就是token_stream各個方法的實際實現。以下是我對get_label的實現：

std::string token_stream::get_label() 
{ 
    std::string temp; 

    // Unless the stream is empty (and it shouldn't be, we checked that in main), 
    // operator>> for std::string is unlikely to fail. It doesn't hurt to be robust 
    // with error checking, though 

    if (!(stream >> temp)) throw ("Fatal error, empty line, bad stream?"); 

    // Ok, we got something. First we should check if the string consists of valid 
    // characters - you probably don't want punctuation characters and such in a label. 
    // I leave this part out for simplicity. 

    // Since labels are optional, we need to check if the token is an opcode. 
    // If that's the case, we return an empty (no) label. 

    if (symbol_table.find(temp) != symbol_table.end()) { 
     buffer = temp; 
     return ""; 
    } 

    // Note that above is where that `buffer` member of token_stream class got used. 
    // If the token was an opcode, we needed to save it so get_opcode method can make 
    // use of it. The other option would be to put the string back in the underlying 
    // stringstream, but that's more work and more code. This way, get_opcode needs 
    // to check if there's anything in buffer and use it, or otherwise extract from 
    // the stringstream normally. 

    // Check if the label was used before: 

    if (labels.count(temp)) 
     throw syntax_error("Label already used."); 

    return temp; 
}

就是這樣。我將其餘的實施作爲練習留給你。希望它有幫助。 :)

來源

2012-09-19 10:18:53 jrok

你肯定需要正則表達式，比如Boost Regex；或者詞法分析和語法解析工具，例如lex/yacc、flex/bison或Boost Spirit。

在字符串和數據流中保持這種複雜性並不值得。

來源

2012-09-18 00:23:34

我不知道如何使用Boost。每當我加入 #include <boost/tokenizer.hpp> 時，就會得到這個錯誤：「boost/tokenizer.hpp：沒有這樣的文件或目錄。編譯終止。」 – cadavid4j

http://www.boost.org/doc/libs/1_51_0/more/getting_started/index.html - 然後選擇適用於您的平臺的入門指南。當然，找到你想要使用的boost庫的頁面。 –

如果我嘗試在另一臺機器上編譯此源代碼，它能編譯通過嗎？如果不能，那麼Boost就不在考慮範圍之內了。 – cadavid4j

使用stringstream標記一個具有不同的分隔符的字符串

回答

相關問題