2015-12-20 76 views
0

我之前問過類似的問題,並認爲我有正確的答案,但後來意識到我正在捕獲一些我不應該的字符串。Javascript正則表達式忽略第一捕獲組

我想解析一個大的文本文件,並用正則表達式拉出某些元素。我爲我的網站使用Node,所以我在Javascript中執行此操作。

在下面的示例中,我試圖用逗號和句點匹配10個數字串。在第一個例子中,我匹配正確的模式,但我捕獲兩個外圍的字符串(我只想要以「4 0000 ....」開頭的行末尾的數字)。

https://regex101.com/r/nO8nM1/8

在這個例子中,我匹配字符串的適合的情況下,但我不能夠忽略第一個捕獲組,所以其他的字符和空格都包括在內。

https://regex101.com/r/uB6hE4/1

正則表達式:

/(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm 

樣本數據:

    23205  - Grants Current-County Operatin      4,425,327.00" 

" 4 0000047387   Central Equatoria State   1003-1478 Sta Hosp Oper Oct     85,784.00" 
" 4 0000047442   EASTERN EQUATORIA ST    1003-1479 Sta Hosp Oper Oct     93,137.00" 
" 4 0000047485   JONGLEI STATE     1003-1519 Sta Hosp Oper Oct     144,608.00" 
" 4 0000047501   Lakes State      1003-1482 Sta Hosp Oper Oct     93,137.00" 
" 4 0000047528   Unity State      1003-1484 Sta Hosp Oper Oct     75,980.00" 
" 4 0000047532   Northern Bahr-el State   1003-1483 Sta Hosp Oper Oct     58,824.00" 
" 4 0000047615   Western E State     1003-1488 Sta Hosp Oper Oct     93,137.00" 
" 4 0000047638   Warap State      1003-1486 Sta Hosp Oper Oct     51,471.00" 
" 4 0000047680   Upper Nile State     1003-1485 Capitation     102,941.00" 
" 4 0000047703   Western BG State     1003-1487 Sta Hosp Oper Oct     34,314.00" 
                          ---------------------- 
"  Total For Period   4                  833,333.00" 
---------------------------------------------------------------------------------------------------------------------------- 
Fiscal Year  2015/16        Republic Of South Sudan       Date  2015/11/20 
Period     5                    Time  12:58:40 
                FreeBalance Financial Management System     Page    7 
---------------------------------------------------------------------------------------------------------------------------- 
                  Vendor Analysis Report 

                   1091 Health (MOH) 
    Prd Voucher #   Vendor Name      Description       Amount 
    --- ---------------- ------------------------------ ----------------------------- ---------------------- 
                          ---------------------- 
" 

(\d+,\d+,\d+.\d+)(?=") 

正則表達式2:

/(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm 

在我的代碼我推這些值對象的數組如果他們在場。我只嘗試推送與我想要的相關的匹配組,但它只會從匹配中推送索引項。

我度過了困難時期嘗試的?:?=,並?!忽略第一個捕獲組中的第二個鏈接無濟於事幾種不同的組合。我覺得解決方案必須相當簡單,但我不能完全達到目的。任何想法我做錯了什麼?

我的代碼:

var openFile = function(event) { 
    var input = event.target; 
    var reader = new FileReader(); 
    reader.onload = function() { 
    var text = reader.result; 
    // console.log(text.substring(0, 999999999999999)); 
     var section = text.substring(0, 9999999999999999); 
     var subSection = []; 
     console.log(typeof subSection); 
     var masterArray = new Object(); 
     var uploadDate = ""; 
     var period = ""; 
     var transferArray = []; 
     var subSectionRegex =/ Total([\s\S]*?)Total|^\s+\d{4,5}([\s\S]*?)Total F/gm; 
     var transferCodeRegex = /[0-9]{4,5}/; 
     var voucherNumberRegex = /([0-9]{7,10}[\S])(?=\s+)/g; 
     var vendorRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)/gm; 
     var descriptionRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+\s(\s\S+)?)(?=\s+100)/g; 
     // var descriptionRegex = /(\d{4}-\d{4})(\D+)*\s\D/g; 
     var amountRegex = /(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm; 
     // var amountRegex = /(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm; 
     // var amountRegex = /\w\s{10,20}(\d+(?:,\d{3})*\.\d+)/gm; 
     var oneLineAmountRegex = /(\d+,\d+,\d+.\d+)|\d+,\d+.\d+/g; 
     var oneLineDescRegex =/- (\D+)|- \d+(\D+)/gm; 
     var allData = [{}]; 

     console.log('section: ' + typeof section); 
     subSection = section.match(subSectionRegex); 
     subSection = subSection.filter(Boolean); 
     console.log(typeof subSection); 

     function extractDate() { 
     uploadDate = section.match(/Date (.*)/)[1].trim(); 
     uploadDate = new Date(uploadDate); 
     allData["uploadDate"] = uploadDate; 
     } 
     extractDate(); 
     // console.log(allData.uploadDate); 

     function extractPeriod() { 
     period = section.match(/Period (.*)/)[1].trim(); 
     period = period.split(" "); 
     period = period[0]; 
     period = parseInt(period); 
     // console.log("period: " + period); 
     allData["period"] = period; 
     } 
     extractPeriod(); 
     // console.log(allData.period); 

     function extractDetails() { 
     for(var i = 0; i < subSection.length; i++) { 
      if(subSection[i].match(transferCodeRegex) && subSection[i].match(voucherNumberRegex) && subSection[i].match(vendorRegex) && subSection[i].match(descriptionRegex) && subSection[i].match(amountRegex)) { 
       transferArray.push({ 
        "transferCode": subSection[i].match(transferCodeRegex), 
        "details": [{ 
         "voucherNumber": subSection[i].match(voucherNumberRegex), 
         "vendor": subSection[i].match(vendorRegex), 
         "description": subSection[i].match(descriptionRegex), 
         "total": subSection[i].match(amountRegex) 
        }] 
       }) 
      } else { 
       transferArray.push({ 
        "transferCode": subSection[i].match(transferCodeRegex), 
        "details": [{ 
         "voucherNumber": subSection[i].match(voucherNumberRegex), 
         "description": subSection[i].match(oneLineDescRegex), 
         "total": subSection[i].match(oneLineAmountRegex) 
        }] 
       }) 
      } 
     } 
     } 

    function removeNulls(obj) { 
      var isArray = obj instanceof Array; 
      for(var k in obj) { 
       console.log('k: ' + k); 
      if(obj[k] === null || obj[k] === undefined) isArray ? obj.splice(k, 1) : delete obj[k]; 
      else if (typeof obj[k] === "object") removeNulls(obj[k]); 
      } 
     } 

     removeNulls(transferArray); 
     console.log(transferArray); 
     console.log(JSON.stringify(transferArray, null, 2)) 

     function cleanData() { 
      transferArray.forEach(function(e) { 
       console.log(e) 
      e.details.forEach(function(evt) { 
       console.log(evt) 
       console.log(evt.amount) 
       console.log(evt.description) 
      for(i = 0; i < evt.amount.length; i++) { 
       // evt.amount[i] = evt.amount[i].toString(); 
       // evt.amount[i] = evt.amount[i].replace(/^[a-zA-Z]\s+/g, ''); 
       evt.amount[i] = parseFloat(evt.amount[i].replace(/\,/g, "")); 
      } 
      for(i = 0; i < evt.description.length; i++) { 
      evt.description[i] = evt.description[i].toString(); 
      evt.description[i] = evt.description[i].trim(); 
      } 
      return(evt); 
      }) 
       // console.log(evt.amount); 
       // console.log(evt.description); 
      }); 
     } 
     cleanData(); 
     console.log(transferArray); 

     console.log(transferArray); 

      //adds detailed data to allData array 
     allData["section"] = transferArray; 

     extractDetails(); 
     console.log(allData); 

     function pushArrayToObject() { 

     } 
    }; 
    reader.readAsText(input.files[0]); 
}; 
+0

從你寫的是什麼,很明顯你正在試圖從JS得到一個回顧後正則表達式引擎** JS不支持lookbehind **。恕我直言,['(?:\ s \ w {3} \ s + | Capitation \ s +)(\ d +(?:,\ d {3})* \。\ d +)「'](https://regex101.com/r/uH9uI5/1)是這裏使用的最好的正則表達式。這個問題只有你的代碼....我不能在問題中看到。 –

+0

啊,對不起。我認爲它會顯示在鏈接中。我現在將在編輯中添加代碼。 –

回答

1

爲你做這項工作?

正如@stribizhev所述,JS不支持lookbehind。但是,你可以調整你的正則表達式來獲取之間封閉的一切「,那麼你所要做的一切就是捕獲單組返回

這是表達:

/^".*\s+([\d{1,3},]*\d{1,3}.\d{2})"$/; 

所以,你需要的地方得到totalAmount(假設這是你要找的值),你可以做這樣:

subSection[i].match(oneLineAmountRegex[1]) 
+1

感謝您的幫助。我在另一條評論中提到,任何使用匹配組的方法都行不通,因爲它會將單個字符串推送到我的模型中,而我需要它來推送數組。 我想我會把它與多餘的字符,並嘗試迭代,並用Javascript清理它,因爲我現在不能再看正則表達式。 –

+0

請注意''\ d {1,3},]'是一個字符類,而不是一個組。 – sln