有沒有好的實現,可以反轉義數字形式的HTML/XML實體(例如 &#10;),
並把它們替換爲對應的ASCII字符?(標題:用Lua反轉義數字XML實體)
用一個單元測試來表示:
-- The input spells the apostrophe, the double quotes and the trailing newline
-- as numeric character references; "&ok;" is not a known entity and must be
-- left untouched.
local orig = "It&#39;s the &#34;end&#34; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert(fixd == "It's the \"end\" &ok;\n")
有沒有好的實現,可以反轉義數字形式的HTML/XML實體(例如 &#10;),
並把它們替換爲對應的ASCII字符?(標題:用Lua反轉義數字XML實體)
用一個單元測試來表示:
-- The input spells the apostrophe, the double quotes and the trailing newline
-- as numeric character references; "&ok;" is not a known entity and must be
-- left untouched.
local orig = "It&#39;s the &#34;end&#34; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert(fixd == "It's the \"end\" &ok;\n")
這裏有一個簡單的實現,也負責處理核心命名XML實體:
--- Unescape the core named XML entities and numeric character references.
-- Handles &lt; &gt; &quot; &apos; &amp; as well as decimal (&#NN;) and
-- hexadecimal (&#xHH;) references. Numeric references above 255 cannot be
-- represented by string.char and are left in the text unchanged.
-- Limitation: the passes run one after another, so text produced by one pass
-- can be re-read as an entity by a later pass.
-- @tparam string str text that may contain entities
-- @treturn string the text with recognised entities replaced
function unescape(str)
  -- Returning nil from a gsub callback keeps the original match.
  local function to_char(code)
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
  end

  str = string.gsub(str, '&lt;', '<')
  str = string.gsub(str, '&gt;', '>')
  str = string.gsub(str, '&quot;', '"')
  str = string.gsub(str, '&apos;', "'")
  str = string.gsub(str, '&#(%d+);', function(n) return to_char(tonumber(n)) end)
  -- %x (not %d) so that hex digits a-f / A-F are accepted.
  str = string.gsub(str, '&#x(%x+);', function(n) return to_char(tonumber(n, 16)) end)
  str = string.gsub(str, '&amp;', '&') -- Be sure to do this after all others
  return str
end
print(unescape("&quot;Hello&quot; &apos;World&apos;")) --> "Hello" 'World'
但是請注意,這個實現在一種病態情況下會失敗:數字形式的 & 實體(&#38;)後面緊跟文字 amp;
:
-- "&#38;" is the numeric reference for "&"; once it is decoded, the following
-- literal "amp;" completes a second entity that a later pass decodes again.
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &
-- The result should actually be Ampersand entity is &amp;
我們可以通過一次性處理所有實體來解決這個邊緣情況,但代碼會變得醜陋一些:
--- Unescape core named XML entities and numeric references in a single pass.
-- Every "&...;" candidate is examined exactly once, so replacement text is
-- never re-scanned as a new entity. Anything that is not recognised
-- (unknown names, malformed numbers, codes above 255 that string.char cannot
-- produce) is left in the text unchanged.
-- @tparam string str text that may contain entities
-- @treturn string the text with recognised entities replaced
function unescape(str)
  local map = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
  str = string.gsub(str, '(&(#?x?)([%d%a]+);)', function(orig, n, s)
    if n == '' then
      -- Named entity: only looked up when there is no "#"/"x" prefix.
      return map[s] or orig
    end
    local code
    if n == '#' then
      -- Decimal reference: accept digits only (rejects e.g. "1e2", "0x41").
      if s:find('^%d+$') then code = tonumber(s) end
    elseif n == '#x' then
      code = tonumber(s, 16)
    end
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
    return orig
  end)
  return str
end
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &amp;
最後,我們可以把它展開(把查找表和回調函數提到函數外面),以換取更快一點的速度:
-- Cache the library functions in locals so each call avoids the table lookups.
local gsub, char = string.gsub, string.char
-- Core named XML entities, keyed by name without "&" and ";".
local entityMap = {["lt"]="<",["gt"]=">",["amp"]="&",["quot"]='"',["apos"]="'"}

--- gsub callback: translate one "&...;" candidate.
-- @tparam string orig the complete match, returned when nothing applies
-- @tparam string n the prefix: "" (named), "#" (decimal) or "#x" (hex)
-- @tparam string s the entity name or the digits
-- @treturn string replacement text for the match
local entitySwap = function(orig,n,s)
  if n == '' then
    return entityMap[s] or orig
  end
  local code
  if n == '#' then
    -- Decimal reference: accept digits only (rejects e.g. "1e2", "0x41").
    if s:find('^%d+$') then code = tonumber(s) end
  elseif n == '#x' then
    code = tonumber(s, 16)
  end
  -- string.char only accepts 0..255; anything else is left untouched.
  if code and code >= 0 and code <= 255 then
    return char(code)
  end
  return orig
end

--- Unescape core named XML entities and numeric references in a single pass.
-- @tparam string str text that may contain entities
-- @treturn string the text with recognised entities replaced
function unescape(str)
  -- Parentheses drop gsub's second result (the substitution count).
  return (gsub(str, '(&(#?x?)([%d%a]+);)', entitySwap))
end
對於少數在下載法語HTML內容時需要反轉義重音字符的程序員,這裏是上述函數的一個更完整的版本。
-- Named HTML entities for the Latin-1 supplement (plus the euro sign), keyed
-- by entity name without "&" and ";". "amp" is deliberately absent: it is
-- handled last by unescape. "not" is a Lua keyword, hence the bracket form.
local LATIN1_ENTITIES = {
  nbsp = ' ', iexcl = '¡', cent = '¢', pound = '£', curren = '¤', yen = '¥',
  brvbar = '¦', sect = '§', uml = '¨', copy = '©', ordf = 'ª', laquo = '«',
  ['not'] = '¬', shy = '', reg = '®', macr = '¯', deg = '°', plusmn = '±',
  sup2 = '²', sup3 = '³', acute = '´', micro = 'µ', para = '¶', middot = '·',
  cedil = '¸', sup1 = '¹', ordm = 'º', raquo = '»',
  frac14 = '¼', frac12 = '½', frac34 = '¾', iquest = '¿',
  Agrave = 'À', Aacute = 'Á', Acirc = 'Â', Atilde = 'Ã', Auml = 'Ä', Aring = 'Å',
  AElig = 'Æ', Ccedil = 'Ç',
  Egrave = 'È', Eacute = 'É', Ecirc = 'Ê', Euml = 'Ë',
  Igrave = 'Ì', Iacute = 'Í', Icirc = 'Î', Iuml = 'Ï',
  ETH = 'Ð', Ntilde = 'Ñ',
  Ograve = 'Ò', Oacute = 'Ó', Ocirc = 'Ô', Otilde = 'Õ', Ouml = 'Ö',
  times = '×', Oslash = 'Ø',
  Ugrave = 'Ù', Uacute = 'Ú', Ucirc = 'Û', Uuml = 'Ü',
  Yacute = 'Ý', THORN = 'Þ', szlig = 'ß',
  agrave = 'à', aacute = 'á', acirc = 'â', atilde = 'ã', auml = 'ä', aring = 'å',
  aelig = 'æ', ccedil = 'ç',
  egrave = 'è', eacute = 'é', ecirc = 'ê', euml = 'ë',
  igrave = 'ì', iacute = 'í', icirc = 'î', iuml = 'ï',
  eth = 'ð', ntilde = 'ñ',
  ograve = 'ò', oacute = 'ó', ocirc = 'ô', otilde = 'õ', ouml = 'ö',
  divide = '÷', oslash = 'ø',
  ugrave = 'ù', uacute = 'ú', ucirc = 'û', uuml = 'ü',
  yacute = 'ý', thorn = 'þ', yuml = 'ÿ',
  euro = '€',
}

--- Unescape Latin-1 named entities, numeric references and &amp;.
-- Named entities are replaced by the literal characters written in this
-- file; &nbsp; becomes a plain space and &shy; is removed. Numeric
-- references go through string.char, so codes above 255 are left unchanged.
-- @tparam string str text that may contain entities
-- @treturn string the text with recognised entities replaced
local function unescape(str)
  -- Returning nil from a gsub callback keeps the original match.
  local function to_char(code)
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
  end

  -- One pass for all named entities: with a table as replacement, gsub keeps
  -- the original match whenever the captured name is not a key.
  str = string.gsub(str, '&(%w+);', LATIN1_ENTITIES)
  str = string.gsub(str, '&#(%d+);', function(n) return to_char(tonumber(n)) end)
  -- %x (not %d) so that hex digits a-f / A-F are accepted.
  str = string.gsub(str, '&#x(%x+);', function(n) return to_char(tonumber(n, 16)) end)
  str = string.gsub(str, '&amp;', '&') -- Be sure to do this after all others
  return str
end
在查找 `entityMap` 的分支中應該檢查 `n == ""`(否則可能會不小心匹配到 `&#amp;`) – daurnimator 2016-11-10 15:27:49