2013-11-01 28 views
1

Rubular 的結果與 IRB 不同。我費了很大的功夫才得到這個字符串:

"<html>\n<head>\n<script language=\"JavaScript\"> \n\n  //////////////////////////////////////////////////////////////// \n  // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the \n  // public domain. It would be nice if you left this header intact. \n  // Base64 code from Tyler Akins -- http://rumkin.com \n  var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/=\"; \n\n  function encode64(input) { \n   var output = \"\"; \n   var chr1, chr2, chr3; \n   var enc1, enc2, enc3, enc4; \n   var i = 0; \n\n   do { \n    chr1 = input.charCodeAt(i++); \n    chr2 = input.charCodeAt(i++); \n    chr3 = input.charCodeAt(i++); \n\n    enc1 = chr1 >> 2; \n    enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); \n    enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); \n    enc4 = chr3 & 63; \n\n    if (isNaN(chr2)) { \n     enc3 = enc4 = 64; \n    } else if (isNaN(chr3)) { \n     enc4 = 64; \n    } \n\n    output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + \n     keyStr.charAt(enc3) + keyStr.charAt(enc4); \n   } while (i < input.length); \n\n   return output; \n  } \n  function decode64(input) { \n   var output = \"\"; \n   var chr1, chr2, chr3; \n   var enc1, enc2, enc3, enc4; \n   var i = 0; \n\n   // remove all characters that are not A-Z, a-z, 0-9, +, /, or = \n   input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\"); \n\n   do { \n    enc1 = keyStr.indexOf(input.charAt(i++)); \n    enc2 = keyStr.indexOf(input.charAt(i++)); \n    enc3 = keyStr.indexOf(input.charAt(i++)); \n    enc4 = keyStr.indexOf(input.charAt(i++)); \n\n    chr1 = (enc1 << 2) | (enc2 >> 4); \n    chr2 = ((enc2 & 15) << 4) | (enc3 >> 2); \n    chr3 = ((enc3 & 3) << 6) | enc4; \n\n    output = output + String.fromCharCode(chr1); \n\n    if (enc3 != 64) { \n     output = output + String.fromCharCode(chr2); \n    } \n    if (enc4 != 64) { \n     output = output + String.fromCharCode(chr3); \n    } \n   } while (i < input.length); \n\n   return output; \n  } \n\n  // end 
of Tyler Akins' code \n  //////////////////////////////////////////////////////////////// \n function escapePluses(s) { \n  return s.replace(/\\+/g, \"%2B\"); \n } \n function getFragment(thisuri) { \n  var pound = thisuri.indexOf(\"#\"); \n  if (pound == -1) { \n   return null; \n  } else { \n   return thisuri.substr(pound + 1); \n  } \n } \n function saveFragment() { \n  var fragment = getFragment(document.URL); \n  if (fragment != null) { \n   var pre_marker = \"&aka_frag=\"; \n   var g_req = decode64(document.relay.pubcookie_g_req.value); \n   var header_end = g_req.indexOf(pre_marker) + pre_marker.length; \n   var req_head = g_req.substr(0,header_end); \n   var req_foot = g_req.substr(header_end); \n   if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) { \n    req_foot = req_foot.substr(req_foot.indexOf(\"&\")); \n   } \n   var new_req = req_head + escapePluses(encode64(fragment)) + req_foot; \n   document.relay.pubcookie_g_req.value = encode64(new_req); \n  } \n } \n\n function doStuff() { \n  saveFragment(); \n  document.relay.submit(); \n } \n\n// setTimeout('doStuff()', 1000); \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on, please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n" 

我想讓這個字符串匹配這個表達式:

<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form> 

這在 Rubular 中按預期工作,但在 IRB(1.9.3)中我得到以下結果:

1.9.3p448 :147 > data =~/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/ 
=> nil 

我在這裏做錯了什麼?

+1

這種情況請使用 Nokogiri……您的預期輸出是什麼?我之前沒有想到這一點 –

+0

。我認爲這可能是一個好主意。 – Ramy

回答

0

您需要在此處使用多行正則表達式:加上 m 修飾符,讓 `.` 也能匹配換行符(您的字符串中 <form> 與 </form> 之間包含換行符)。例如:

/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/m 
3

正則表達式和 HTML/XML 並不是好搭檔。HTML 發生變化的可能性非常大,到時你的模式就會失效。使用解析器可以顯著降低代碼出錯的機率。例如,很容易預見標籤中的屬性可能會改變順序。以這個標籤爲例:

<form method="post" action="https://weblogin.server.com/" name="relay"> 

它可能會變成以下任一形式:

<form method="post" action="https://weblogin.server.com/" name="relay" >...</form> 
<form method="post" action="https://weblogin.server.com/" name="relay1" >...</form> 
<form name="relay" method="post" action="https://weblogin.server.com/">...</form> 
<form name="relay" method="post" action="https://weblogin.server.com/">...</form > 

只要發生其中任何一種變化,正則表達式就會立刻失效。

解析器不會關心這些更改。

require 'nokogiri' 

html = "<html>\n<head>\n<script language=\"JavaScript\"> \n\n  //////////////////////////////////////////////////////////////// \n  // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the \n  // public domain. It would be nice if you left this header intact. \n  // Base64 code from Tyler Akins -- http://rumkin.com \n  var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/=\"; \n\n  function encode64(input) { \n   var output = \"\"; \n   var chr1, chr2, chr3; \n   var enc1, enc2, enc3, enc4; \n   var i = 0; \n\n   do { \n    chr1 = input.charCodeAt(i++); \n    chr2 = input.charCodeAt(i++); \n    chr3 = input.charCodeAt(i++); \n\n    enc1 = chr1 >> 2; \n    enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); \n    enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); \n    enc4 = chr3 & 63; \n\n    if (isNaN(chr2)) { \n     enc3 = enc4 = 64; \n    } else if (isNaN(chr3)) { \n     enc4 = 64; \n    } \n\n    output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + \n     keyStr.charAt(enc3) + keyStr.charAt(enc4); \n   } while (i < input.length); \n\n   return output; \n  } \n  function decode64(input) { \n   var output = \"\"; \n   var chr1, chr2, chr3; \n   var enc1, enc2, enc3, enc4; \n   var i = 0; \n\n   // remove all characters that are not A-Z, a-z, 0-9, +, /, or = \n   input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\"); \n\n   do { \n    enc1 = keyStr.indexOf(input.charAt(i++)); \n    enc2 = keyStr.indexOf(input.charAt(i++)); \n    enc3 = keyStr.indexOf(input.charAt(i++)); \n    enc4 = keyStr.indexOf(input.charAt(i++)); \n\n    chr1 = (enc1 << 2) | (enc2 >> 4); \n    chr2 = ((enc2 & 15) << 4) | (enc3 >> 2); \n    chr3 = ((enc3 & 3) << 6) | enc4; \n\n    output = output + String.fromCharCode(chr1); \n\n    if (enc3 != 64) { \n     output = output + String.fromCharCode(chr2); \n    } \n    if (enc4 != 64) { \n     output = output + String.fromCharCode(chr3); \n    } \n   } while (i < input.length); \n\n   return output; \n  } \n\n  
// end of Tyler Akins' code \n  //////////////////////////////////////////////////////////////// \n function escapePluses(s) { \n  return s.replace(/\\+/g, \"%2B\"); \n } \n function getFragment(thisuri) { \n  var pound = thisuri.indexOf(\"#\"); \n  if (pound == -1) { \n   return null; \n  } else { \n   return thisuri.substr(pound + 1); \n  } \n } \n function saveFragment() { \n  var fragment = getFragment(document.URL); \n  if (fragment != null) { \n   var pre_marker = \"&aka_frag=\"; \n   var g_req = decode64(document.relay.pubcookie_g_req.value); \n   var header_end = g_req.indexOf(pre_marker) + pre_marker.length; \n   var req_head = g_req.substr(0,header_end); \n   var req_foot = g_req.substr(header_end); \n   if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) { \n    req_foot = req_foot.substr(req_foot.indexOf(\"&\")); \n   } \n   var new_req = req_head + escapePluses(encode64(fragment)) + req_foot; \n   document.relay.pubcookie_g_req.value = encode64(new_req); \n  } \n } \n\n function doStuff() { \n  saveFragment(); \n  document.relay.submit(); \n } \n\n// setTimeout('doStuff()', 1000); \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on, please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n" 

doc = Nokogiri::HTML(html) 

form = doc.at('form') 
puts form.to_html 
# >> <form method="post" action="https://weblogin.server.com/" name="relay"> 
# >> <input type="hidden" name="pubcookie_g_req" value="b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA=="><input type="hidden" name="post_stuff" value=""><input type="hidden" name="relay_url" value="https://siam-pro.qa.server.com/PubCookie.reply"><noscript> 
# >> <p align="center">You do not have Javascript turned on, please click the button to continue. 
# >> </p> 
# >> <p align="center"> 
# >> <input type="submit" name="go" value="Continue"></p> 
# >> </noscript> 
# >> </form> 

form['action'] # => "https://weblogin.server.com/" 
input = form.at('input') 
input['name'] # => "pubcookie_g_req" 
input['value'] # => "b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==" 

Nokogiri是Ruby偏愛的XML/HTML解析器。它速度快,易於使用,並且在我的使用中非常健壯。

+0

爲這個詳盡的答案 +1,但另一個答案確實回答了我原本措辭的問題。SO 是一個講究嚴謹的地方,我過去也曾被別人這樣嚴格要求過,所以在這裏我也會照章行事。 – Ramy

+0

如果你願意,可以接受這個答案,Ramy。我的回答只是單純指出「哪裏出了錯」,而這個答案提供了更好的方法。 – struthersneil

+0

在我看來,這不是選擇答案的好理由,但這取決於你。既然知道什麼是最好的長期解決方案,我就不會給出不同的回答。 –