2013-05-18 22 views

回答

2

嘗試 lua-iconv,它是 iconv 的 Lua 綁定。

+0

我嘗試過,但不幸的是它無法從 Windows-1256 轉換爲 UTF-8。謝謝 – Ali

+0

@Ali:你確定嗎?您必須使用 iconv -l 列出的名稱之一(不區分大小寫)。arabic2utf = iconv.new("cp1256", "utf8"); output_string = arabic2utf:iconv(input_string) 這樣寫能正常工作嗎? –

1
-- Excerpt of the Windows-1256 -> Unicode mapping, one "0xBYTE 0xCODEPOINT #NAME"
-- entry per line. Only a handful of entries are listed here; paste the full
-- table from the URL below to cover every byte. The two "--" lines inside the
-- long string are ordinary string content (the parser below skips them), not
-- Lua comments.
local win2utf_list = [[ 
0x00 0x0000 #NULL 
0x01 0x0001 #START OF HEADING 
0x02 0x0002 #START OF TEXT 
-- Download full text from 
-- http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT 
0xFD 0x200E #LEFT-TO-RIGHT MARK 
0xFE 0x200F #RIGHT-TO-LEFT MARK 
0xFF 0x06D2 #ARABIC LETTER YEH BARREE 
]] 

-- Maps a one-byte Windows-1256 string to the UTF-8 string for the same character.
local win2utf = {}

--- Encode a single Unicode code point as a UTF-8 byte string.
-- @tparam number code_point non-negative integer code point
-- @treturn string the UTF-8 encoding of code_point
local function utf8_encode(code_point)
    local tail = {}
    -- Smallest value that no longer fits in the lead byte: 128 for a
    -- one-byte sequence, then 32, 16, 8 as continuation bytes are added.
    local lead_limit = 128
    while code_point >= lead_limit do
        -- Peel off the low 6 bits as a continuation byte (10xxxxxx).
        table.insert(tail, 1, string.char(128 + code_point % 64))
        code_point = math.floor(code_point / 64)
        lead_limit = lead_limit > 32 and 32 or math.floor(lead_limit / 2)
    end
    -- 256 - 2*lead_limit yields the lead-byte prefix: 0, 192, 224 or 240.
    return string.char(256 - 2 * lead_limit + code_point) .. table.concat(tail)
end

-- Parse every "0xBB 0xCCCC" pair of the mapping text into the lookup table.
for byte_hex, code_hex in win2utf_list:gmatch('0x(%x%x)%s+0x(%x+)') do
    win2utf[string.char(tonumber(byte_hex, 16))] = utf8_encode(tonumber(code_hex, 16))
end

--- Convert a Windows-1256 encoded string to UTF-8.
-- Bytes that have no entry in the mapping table are copied through unchanged.
-- @tparam string win_string text encoded as Windows-1256
-- @treturn string the same text encoded as UTF-8 (exactly one return value)
local function convert_to_utf8(win_string)
    -- gsub returns (result, substitution_count); the parentheses drop the
    -- count so callers that forward or collect all results get only the string.
    return (win_string:gsub('.', win2utf))
end
0

Windows-1256 是設計爲 ASCII 的 8 位擴展(疊加在 ASCII 之上)的字符集之一。因此它有 256 個字符,每個字符編碼爲一個字節。

UTF-8 是 Unicode 字符集的一種編碼。Unicode 是「通用」字符集,因此它是 Windows-1256 字符集的超集。所以,轉換時不需要用「替代字符」去代替不屬於目標字符集的字符,也就不會丟失信息。

轉換很簡單:只需將每個字符的 Windows-1256 字節轉換爲對應的 UTF-8 字節序列。使用查找表是實現這一點的簡便方法。

-- Lookup table used to translate Windows-1256 text to UTF-8.
-- The 256 entries are listed in byte order, so entry i (1-based) is the Lua
-- string holding the UTF-8 encoding of the character whose one-byte
-- Windows-1256 code is i - 1.
local encoding = { 
-- table maps the one byte Windows-1256 encoding for a character to a Lua string with the UTF-8 encoding for the character 

-- Bytes 0-127: each entry is the same single byte (decimal escapes \000..\127).
"\000"  , "\001"  , "\002"  , "\003"  , "\004"  , "\005"  , "\006"  , "\007"  , 
"\008"  , "\009"  , "\010"  , "\011"  , "\012"  , "\013"  , "\014"  , "\015"  , 
"\016"  , "\017"  , "\018"  , "\019"  , "\020"  , "\021"  , "\022"  , "\023"  , 
"\024"  , "\025"  , "\026"  , "\027"  , "\028"  , "\029"  , "\030"  , "\031"  , 
"\032"  , "\033"  , "\034"  , "\035"  , "\036"  , "\037"  , "\038"  , "\039"  , 
"\040"  , "\041"  , "\042"  , "\043"  , "\044"  , "\045"  , "\046"  , "\047"  , 
"\048"  , "\049"  , "\050"  , "\051"  , "\052"  , "\053"  , "\054"  , "\055"  , 
"\056"  , "\057"  , "\058"  , "\059"  , "\060"  , "\061"  , "\062"  , "\063"  , 
"\064"  , "\065"  , "\066"  , "\067"  , "\068"  , "\069"  , "\070"  , "\071"  , 
"\072"  , "\073"  , "\074"  , "\075"  , "\076"  , "\077"  , "\078"  , "\079"  , 
"\080"  , "\081"  , "\082"  , "\083"  , "\084"  , "\085"  , "\086"  , "\087"  , 
"\088"  , "\089"  , "\090"  , "\091"  , "\092"  , "\093"  , "\094"  , "\095"  , 
"\096"  , "\097"  , "\098"  , "\099"  , "\100"  , "\101"  , "\102"  , "\103"  , 
"\104"  , "\105"  , "\106"  , "\107"  , "\108"  , "\109"  , "\110"  , "\111"  , 
"\112"  , "\113"  , "\114"  , "\115"  , "\116"  , "\117"  , "\118"  , "\119"  , 
"\120"  , "\121"  , "\122"  , "\123"  , "\124"  , "\125"  , "\126"  , "\127"  , 
-- Bytes 128-255: each entry is a two- or three-byte UTF-8 sequence.
"\226\130\172", "\217\190" , "\226\128\154", "\198\146" , "\226\128\158", "\226\128\166", "\226\128\160", "\226\128\161", 
"\203\134" , "\226\128\176", "\217\185" , "\226\128\185", "\197\146" , "\218\134" , "\218\152" , "\218\136" , 
"\218\175" , "\226\128\152", "\226\128\153", "\226\128\156", "\226\128\157", "\226\128\162", "\226\128\147", "\226\128\148", 
"\218\169" , "\226\132\162", "\218\145" , "\226\128\186", "\197\147" , "\226\128\140", "\226\128\141", "\218\186" , 
"\194\160" , "\216\140" , "\194\162" , "\194\163" , "\194\164" , "\194\165" , "\194\166" , "\194\167" , 
"\194\168" , "\194\169" , "\218\190" , "\194\171" , "\194\172" , "\194\173" , "\194\174" , "\194\175" , 
"\194\176" , "\194\177" , "\194\178" , "\194\179" , "\194\180" , "\194\181" , "\194\182" , "\194\183" , 
"\194\184" , "\194\185" , "\216\155" , "\194\187" , "\194\188" , "\194\189" , "\194\190" , "\216\159" , 
"\219\129" , "\216\161" , "\216\162" , "\216\163" , "\216\164" , "\216\165" , "\216\166" , "\216\167" , 
"\216\168" , "\216\169" , "\216\170" , "\216\171" , "\216\172" , "\216\173" , "\216\174" , "\216\175" , 
"\216\176" , "\216\177" , "\216\178" , "\216\179" , "\216\180" , "\216\181" , "\216\182" , "\195\151" , 
"\216\183" , "\216\184" , "\216\185" , "\216\186" , "\217\128" , "\217\129" , "\217\130" , "\217\131" , 
"\195\160" , "\217\132" , "\195\162" , "\217\133" , "\217\134" , "\217\135" , "\217\136" , "\195\167" , 
"\195\168" , "\195\169" , "\195\170" , "\195\171" , "\217\137" , "\217\138" , "\195\174" , "\195\175" , 
"\217\139" , "\217\140" , "\217\141" , "\217\142" , "\195\180" , "\217\143" , "\217\144" , "\195\183" , 
"\217\145" , "\195\185" , "\217\146" , "\195\187" , "\195\188" , "\226\128\142", "\226\128\143", "\219\146" 
} 

-

--- Translate a Windows-1256 string to UTF-8 using the `encoding` lookup table.
-- Each input byte b is replaced by the table entry at index b + 1.
-- @tparam string str input text, one byte per character
-- @treturn string concatenation of the looked-up entries
-- Raises an error ("Parameter 1 must be a string") when str is not a string.
function encoding.convert(str)
    assert(type(str) == "string", "Parameter 1 must be a string")
    local pieces = {}
    for index = 1, #str do
        -- Byte values run 0..255 while the table is 1-based, hence the + 1.
        pieces[index] = encoding[str:byte(index) + 1]
    end
    return table.concat(pieces)
end
-- Load-time sanity check: plain ASCII text must come back unchanged.
assert(encoding.convert("test1") == "test1", "test1 failed")

參考文獻:

Joel Spolsky,The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)

Roberto Ierusalimschy,Creating Strings Piece by Piece