* This is a heading 
    P1 Start a paragraph here but since it is the first indentation level 
the paragraph may have a lower indentation on the next line 
    or a greater one for that matter. 

    + LI1.1 I am beginning a list here 
    + LI1.2 Here begins another list item 
    which continues here 
     and also here 
    P2 but is broken here (this line becomes a paragraph 
    outside of the first list). 
    + LI2.1 P1 Second list item. 
    - LI2.1.1 Inner list with a simple item 
    - LI2.1.2 P1 and with an item containing several paragraphs. 
     Here is the second line in the item, and now 

     LI2.1.2 P2 I begin a new paragraph still in the same item. 
     The indentation can be only higher 
    LI2.1 P2 but if the indentation is lower, it breaks the item, 
    (and the whole list), and this is a paragraph in the LI2.1 
    list item 

    - LI 2.2.1 You get the picture 
    P3 Just plain text outside of the list. 





據我所知(AFAIK),沒有簡單的方法來解析這種格式。這類類似維基的格式處理起來非常令人頭痛。您是打算手動編寫解析器,還是編寫(或轉寫)一份語法,再讓解析器生成器爲您生成解析器? –


嗯,你的評論是假設我在手工編寫解析器;我確實已經開始這樣做了,但還沒有找到正確的方法。也許寫一份語法會更容易,但我不知道該如何處理有意義的空白(significant whitespace)。我剛接觸解析,所以到目前爲止我嘗試過的每一種做法都碰了壁。 :) – glmxndr


這種格式有沒有在某處寫出正式的語法?在我看來,問題在於沒有一個用來結束語句的記號(token)。有幾種語言用空白(縮進)而不是分號和大括號來劃分結構,但我想不出有哪一種語言像 P1 的例子那樣,允許第一行之後的各行使用任意的縮進程度。 – Samsdram











jison的+1。你只需要使用一個好的老的lex/yacc端口來進行解析。 – Raynos


@Raynos,好的,但是這並不能完全回答如何處理有意義的空白(significant whitespace),尤其是 LI 項目具有這樣的特性:它們的第一行和後續行可以有不同的縮進(參見問題中的示例)。 – glmxndr


@subtenante [購買龍書](http://en.wikipedia.org/wiki/Compilers:_Principles,_Techniques,_and_Tools)。然後閱讀它。然後用你驚人的編譯器知識解決你的問題。 – Raynos


我喜歡解析器和編譯器理論,所以我寫了一個小解析器(手工),它能夠將您的示例代碼片段解析爲一個XML DOM Document對象。可以對其進行修改,以便生成其他類型的樹結構,如自定義AST(抽象語法樹)。



以你的示例片段作爲輸入,語句 `result = new OrgModeParser().parse(input); result.xml` 返回:

<org-mode-document indentLevel="-1">
    <section indentLevel="0">
     <header indentLevel="0">This is a heading</header>
      <paragraph indentLevel="1">P1 Start a paragraph here but since it is the first indentation level the paragraph may have a lower indentation on the next line or a greater one for that matter.</paragraph>
      <list indentLevel="1">
       <list-item indentLevel="1">
        <paragraph indentLevel="2">LI1.1 I am beginning a list here</paragraph>
       </list-item>
       <list-item indentLevel="1">
        <paragraph indentLevel="2">LI1.2 Here begins another list item which continues here and also here</paragraph>
       </list-item>
      </list>
     <paragraph indentLevel="1">P2 but is broken here (this line becomes a paragraph outside of the first list).</paragraph>
     <list indentLevel="1">
      <list-item indentLevel="1">
       <paragraph indentLevel="2">LI2.1 P1 Second list item.</paragraph>
       <list indentLevel="2">
        <list-item indentLevel="2">
         <paragraph indentLevel="3">LI2.1.1 Inner list with a simple item</paragraph>
        </list-item>
        <list-item indentLevel="2">
         <paragraph indentLevel="3">LI2.1.2 P1 and with an item containing several paragraphs. Here is the second line in the item, and now</paragraph>
         <paragraph indentLevel="3">LI2.1.2 P2 I begin a new paragraph still in the same item. The indentation can be only higher</paragraph>
        </list-item>
       </list>
       <paragraph indentLevel="2">LI2.1 P2 but if the indentation is lower, it breaks the item, (and the whole list), and this is a paragraph in the LI2.1 list item</paragraph>
       <list indentLevel="2">
        <list-item indentLevel="2">
         <paragraph indentLevel="3">LI2.2.1 You get the picture</paragraph>
        </list-item>
       </list>
      </list-item>
     </list>
     <paragraph indentLevel="1">P3 Just plain text outside of the list.</paragraph>
    </section>
</org-mode-document>


/*
 * File: orgmodparser.js
 * Basic usage: var object = new OrgModeParser().parse(input);
 * Works on: JScript and JScript.Net.
 * - For other JavaScript platforms, just replace or override the .createRoot() method
 */

/**
 * Hand-written parser for a small org-mode-like outline format.
 *
 * Recognized line kinds (see the "re" property):
 *   - "* text"            a heading; opens a new <section> with a <header>
 *   - "+ text" / "- text" a list item (optionally indented)
 *   - plain text          a paragraph line, or the continuation of one
 *   - blank line          a paragraph break
 *
 * The result is an XML DOM document whose elements carry an "indentLevel"
 * attribute.
 *
 * @param {Object} [options] Properties copied onto the instance; use it to
 *     override INDENT_WIDTH, LINE_SEPARATOR, re or createRoot.
 */
var OrgModeParser = function (options) {
    if (typeof options == "object") {
        for (var i in options) {
            this[i] = options[i];
        }
    }
};

OrgModeParser.prototype = {

    // Number of spaces that make up one indentation level.
    "INDENT_WIDTH" : 2, // Two spaces
    // String on which the input text is split into lines.
    "LINE_SEPARATOR" : "\r\n",

    /*
     * Each line in the input will be matched against this regexp.
     * Only spaces are allowed as indentation characters.
     * The symbols '*', '+' and '-' will be recognized, but only if they are followed by at least one space.
     * Add other symbols in this regexp if you want the parser to recognize them.
     * Capture groups: 1 = indentation, 2 = modifier plus its trailing spaces (optional), 3 = content.
     */
    "re" : /^( *)([\+\-\*] +)?(.*)/,

    /**
     * Creates the empty document that parse() fills in.
     * This function must return a valid XML DOM document object.
     *
     * Tries the MSXML ActiveX documents first (newest to oldest), then falls
     * back to document.implementation.createDocument() where a DOM is
     * available. Override this method on other platforms.
     *
     * @returns {Object|null} An XML DOM document, or null (after an alert)
     *     when none could be created.
     */
    createRoot : function () {
        var progIDs = ["Msxml2.DOMDocument.6.0", "Msxml2.DOMDocument.5.0", "Msxml2.DOMDocument.4.0", "Msxml2.DOMDocument.3.0", "Msxml2.DOMDocument.2.0", "Msxml2.DOMDocument.1.0", "Msxml2.DOMDocument"];
        for (var i = 0; i < progIDs.length; i++) {
            try {
                return new ActiveXObject(progIDs[i]);
            }
            catch (err) {
                // Deliberately ignored: this progID (or ActiveX itself) is
                // unavailable, so probe the next candidate.
            }
        }
        if (typeof document != "undefined" && document.implementation && document.implementation.createDocument) {
            return document.implementation.createDocument("", "", null);
        }
        alert("Org-mode parser - Error - Failed to instantiate root object");
        return null;
    },

    /**
     * Parses org-mode-like text into an XML DOM document.
     *
     * @param {string} text Input text; lines are separated by LINE_SEPARATOR.
     * @returns {Object|null} The document returned by createRoot(), populated
     *     with an <org-mode-document> element, or null when the input is not
     *     a string or no document could be created. Problems found while
     *     parsing are reported through alert(); the offending line is skipped.
     */
    parse : function (text) {

        var body;     // The <org-mode-document> element
        var content;  // The text of the current line, without its indentation and modifier
        var i;        // Index of the current line (shared with the helpers below)
        var indent;   // The indentation of the current line
        var isAfterDubbleLineBreak; // Indicates if the current line follows a blank line
        var lastNode; // The node being processed
        var level;    // The current indentation level; given by indent.length/this.INDENT_WIDTH. Not to confuse with the nesting level
        var line;     // The current line being processed
        var lines;    // Array. Empty lines are included.
        var match;
        var modifier; // This can be "*", "+", "-" or ""
        var root;

        // Creates an element stamped with the current level and, when text is
        // non-empty, a single text node child holding it.
        function createNode (tagName, text) {
            var node = root.createElement(tagName);
            node.setAttribute("indentLevel", level);
            if (text) {
                node.appendChild(root.createTextNode(text));
            }
            return node;
        }

        // DOM attributes come back as strings; compare them as numbers.
        function levelOf (node) {
            return Number(node.getAttribute("indentLevel"));
        }

        // Finds the element the current line must be appended to, walking up
        // from lastNode. Returns null (after an alert) when there is none.
        function getContainer () {
            if (lastNode.tagName == "section") { return lastNode; }
            var anc = lastNode.parentNode;
            // Stop at the document node: it is not an element and has no attributes.
            while (anc && anc.nodeType == 1) {
                // A list item joins an existing list at the same level.
                if ((modifier == "+" || modifier == "-") && anc.tagName == "list" && levelOf(anc) == level) { return anc; }
                // Otherwise attach to the nearest less-indented non-paragraph ancestor.
                if (levelOf(anc) < level && anc.tagName != "paragraph") { return anc; }
                anc = anc.parentNode;
            }
            alert("Org-mode parser - Internal error at line: " + i);
            return null;
        }

        // Handles a "* heading" line.
        function star () {
            // The '*' modifier is not allowed on an indented line.
            // Only this line is skipped; parsing continues with the next one.
            if (indent) { alert("Org-mode parse error: unexpected '*' symbol at line " + i); return; }
            lastNode = body.appendChild(createNode("section"));
            // The section remains the current node
            lastNode.appendChild(createNode("header", content));
        }

        // Handles a "+ item" or "- item" line.
        function plus () {
            var container = getContainer();
            if (!container) { return; }
            var tn = container.tagName;
            if (tn == "section" || tn == "list-item") {
                // First item at this position: open a new list.
                lastNode = container.appendChild(createNode("list"));
                lastNode = lastNode.appendChild(createNode("list-item"));
            } else if (tn == "list") {
                lastNode = container.appendChild(createNode("list-item"));
            } else {
                alert("Org-mode parser - Internal error - Bad container tag name: " + tn);
                return; // Leave lastNode (and its indentLevel) untouched.
            }
            lastNode = lastNode.appendChild(createNode("paragraph", content));
            // The item's text sits one level deeper than its marker.
            lastNode.setAttribute("indentLevel", levelOf(lastNode) + 1);
        }

        // Handles a line without modifier: continues the current paragraph or starts a new one.
        function noModifier () {
            var continuesParagraph = lastNode.tagName == "paragraph" &&
                !isAfterDubbleLineBreak &&
                // Level-1 paragraphs accept any indentation on following lines;
                // deeper ones only accept the same or a higher indentation.
                (levelOf(lastNode) == 1 || level >= levelOf(lastNode));
            if (continuesParagraph) {
                lastNode.childNodes[0].appendData(" " + content);
                return;
            }
            var container = getContainer();
            if (!container) { return; }
            lastNode = container.appendChild(createNode("paragraph", content));
        }

        // Handles a blank line: skips following whitespace-only lines and
        // flags that the next text line starts a new paragraph.
        function dubbleLineBreak () {
            while (lines[i + 1] && /^\s*$/.test(lines[i + 1])) { i++; }
            isAfterDubbleLineBreak = true;
        }

        if (typeof text != "string") { alert("Org-mode - Type error - Input must be of type 'string'"); return null; }

        root = this.createRoot();
        if (!root) { return null; } // createRoot() has already reported the failure

        isAfterDubbleLineBreak = false;
        level = -1;  // Indentation level is -1 initially; it will be 0 for the first "*"-bloc
        lines = text.split(this.LINE_SEPARATOR);
        body = root.appendChild(createNode("org-mode-document"));
        lastNode = body;

        for (i = 0; i < lines.length; i++) {
            line = lines[i];
            match = line.match(this.re);
            if (match === null) { alert("org-mode parse error at line: " + i); return null; }
            indent = match[1];
            level = indent.length / this.INDENT_WIDTH;
            // An unmatched optional group is undefined on standard engines
            // (JScript yields ""); normalize so "no modifier" is always "".
            modifier = match[2] ? match[2].charAt(0) : "";
            content = match[3];

            // These conditions tell the parser what to do when encountering a line with a given modifier
            if (content === "") { dubbleLineBreak(); continue; }
            else if (modifier == "*") { star(); }
            else if (modifier == "+" || modifier == "-") { plus(); }
            else { noModifier(); }
            isAfterDubbleLineBreak = false;
        }
        return root;
    }
};


不錯,這是一個更好的起點,因爲它(幾乎)像我期望的那樣工作 :)。不過我很不願意依賴 DOM:首先是因爲我希望在瀏覽器之外使用這個解析器,其次是因爲 org-mode 的另一些功能無法原樣套進 DOM 模型。(我說「幾乎」符合預期,是因爲 LI2.1.2 應該包含兩個段落:雙換行處應當把段落斷開。) – glmxndr


感謝您的反饋。我已經修正了段落的雙換行(double newline)錯誤,並將輸出類型改爲 XML 文檔對象,這樣它不需要瀏覽器也能工作。 – Luc125


謝謝。你會發現我很煩人,但依賴於Windows ActiveX並不比依靠瀏覽器好得多...... :)但我明白你的建議的要點。 – glmxndr

