2013-04-03 193 views
13

如何在 Go 中將 UTF-16 文本文件讀取爲字符串?

我可以將該文件讀取爲字節數組

,但是當我將它轉換爲字符串

它會把 UTF-16 字節當作 ASCII 來處理

如何將其正確轉換?

package main 

import ("fmt" 
"os" 
"bufio" 
) 

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints the isPrefix flag, the raw bytes, and the bytes converted directly
// to a string.
func main() {
	file, openErr := os.Open("test.txt")
	if openErr != nil {
		fmt.Printf("error opening file: %v\n", openErr)
		os.Exit(1)
	}

	// ReadLine yields the line's bytes, a flag that is set when the line did
	// not fit in the buffer, and an error.
	line, isPrefix, readErr := bufio.NewReader(file).ReadLine()
	if readErr != nil {
		return
	}

	fmt.Println(isPrefix)
	fmt.Println(line)
	// string(line) reinterprets the bytes as they are; no UTF-16 decoding
	// takes place here.
	fmt.Println(string(line))
}

輸出:

[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0 13 0]

��[Script Info]


更新:

我測試了這兩個示例後,我已經詳細說明了現在的確切問題。

在 Windows 中,如果我在行尾添加換行符(CR + LF),CR 會被當作行內容一併讀入,因爲 ReadLine 函數無法正確處理 Unicode([0D 0A] = ok,[0D 00 0A 00] = not ok)。

如果 ReadLine 函數能夠識別 Unicode,它應該理解 [0D 00 0A 00] 並返回 []uint16 而不是 []byte。

所以我認爲我不應該使用bufio.NewReader,因爲它不能讀取utf16,我沒有看到bufio.NewReader.ReadLine可以接受參數作爲標誌來指示讀取文本是utf8,utf16le/be還是UTF32。 go庫中是否有unicode文本的readline函數?

回答

13

UTF-16、UTF-8 和字節順序標記(BOM)由 Unicode Consortium 定義:UTF-16 FAQ、UTF-8 FAQ、Byte Order Mark (BOM) FAQ。以下摘自相關文件:


Issue 4802: bufio: reading lines is too cumbersome

在 Go 中逐行讀取太麻煩了。

人們常常因爲名字而被 bufio.Reader.ReadLine 吸引,但它的簽名很奇怪,返回 (line []byte, isPrefix bool, err error),並且需要做大量額外的工作。

ReadSlice 和 ReadString 需要一個分隔符字節,而它幾乎總是顯而易見卻難看的 '\n',並且它們還可能同時返回一行內容和 EOF。


Revision: f685026a2d38

bufio:新的掃描儀接口

添加一個新的、簡單的接口,用於掃描(通常是文本)數據,它基於一個名爲 Scanner 的新類型。它自行完成內部緩衝,因此即使不注入 bufio.Reader 也應該足夠高效。輸入格式由「分割函數」定義,默認按行分割。


go1.1beta1 released

您可以從老地方下載的二進制和源代碼分發: https://code.google.com/p/go/downloads/list?q=go1.1beta1


下面的程序使用 Unicode 規則,將 UTF-16 文本文件的各行轉換爲 Go 的 UTF-8 編碼字符串。該代碼已經過修改,以利用 Go 1.1 中新的 bufio.Scanner 接口。

package main 

import (
    "bufio" 
    "bytes" 
    "encoding/binary" 
    "fmt" 
    "os" 
    "runtime" 
    "unicode/utf16" 
    "unicode/utf8" 
) 

// UTF16BytesToString decodes b, a sequence of UTF-16 code units laid out in
// byte order o, and returns the text as a UTF-8 encoded Go string.
//
// Every complete byte pair becomes one code unit. When len(b) is odd, the
// dangling final byte is represented by a single utf8.RuneError code unit.
// Surrogate pairs are combined by utf16.Decode.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
	pairs := len(b) / 2

	// Reserve one extra slot for the replacement unit of a dangling byte.
	units := make([]uint16, pairs, pairs+1)
	for k := range units {
		units[k] = o.Uint16(b[2*k:])
	}
	if len(b)%2 != 0 {
		units = append(units, utf8.RuneError)
	}
	return string(utf16.Decode(units))
}

// UTF-16 byte-order states used by the line scanner. unknownEndian is the
// first value of the iota sequence (zero) and marks a byte order that has not
// been determined; bigEndian and littleEndian follow as 1 and 2.
const (
    unknownEndian = iota 
    bigEndian 
    littleEndian 
) 

// dropCREndian removes one trailing two-byte sequence (t1, t2) from data.
// Callers pass the two bytes of a UTF-16 carriage return in the byte order
// they expect. The result is a sub-slice of data; data is returned untouched
// when it is shorter than two bytes or does not end in (t1, t2).
func dropCREndian(data []byte, t1, t2 byte) []byte {
	n := len(data)
	if n < 2 {
		return data
	}
	if tail := data[n-2:]; tail[0] != t1 || tail[1] != t2 {
		return data
	}
	return data[:n-2]
}

// dropCRBE strips a trailing big-endian UTF-16 carriage return (the bytes
// 0x00 followed by '\r') from data, if one is present.
func dropCRBE(data []byte) []byte {
	const (
		hi byte = '\x00'
		lo byte = '\r'
	)
	return dropCREndian(data, hi, lo)
}

// dropCRLE strips a trailing little-endian UTF-16 carriage return (the bytes
// '\r' followed by 0x00) from data, if one is present.
func dropCRLE(data []byte) []byte {
	const (
		lo byte = '\r'
		hi byte = '\x00'
	)
	return dropCREndian(data, lo, hi)
}

// dropCR drops a terminal \r from the data. 
func dropCR(data []byte) ([]byte, int) { 
    var endian = unknownEndian 
    switch ld := len(data); { 
    case ld != len(dropCRLE(data)): 
     endian = littleEndian 
    case ld != len(dropCRBE(data)): 
     endian = bigEndian 
    } 
    return data, endian 
} 

// ScanUTF16LinesFunc returns a split function for a bufio.Scanner that yields
// each line of UTF-16 encoded text, stripped of any trailing end-of-line
// marker, and a function that reports the byte order currently in effect.
//
// The returned tokens are still UTF-16 encoded bytes and may be empty. The
// end-of-line marker is one optional carriage return followed by one
// mandatory newline. In regular expression notation, it is `\r?\n`. The last
// non-empty line of input is returned even if it has no newline.
//
// byteOrder may be binary.BigEndian or binary.LittleEndian to fix the byte
// order up front. Any other value (including nil) means "unknown": the split
// function then looks for a byte order mark at the start of the input and
// otherwise infers the order from the first newline it recognises. If the
// order is still unknown at end of input, it falls back to little endian on
// Windows and big endian elsewhere.
//
// The order function returns nil while the byte order is still unknown.
// Both returned functions share state and are not meant to be used with more
// than one Scanner.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {

	// State shared by the two closures below.
	var endian = unknownEndian
	switch byteOrder {
	case binary.BigEndian:
		endian = bigEndian
	case binary.LittleEndian:
		endian = littleEndian
	}
	const bom = 0xFEFF
	// Only look for a BOM when the caller did not fix the byte order.
	checkBOM := endian == unknownEndian

	// Scanner split function
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {

		if atEOF && len(data) == 0 {
			return 0, nil, nil
		}

		if checkBOM {
			if len(data) < 2 && !atEOF {
				// A BOM is two bytes long. Do not give up on it just because
				// the first read was short; ask for more data instead.
				return 0, nil, nil
			}
			checkBOM = false
			if len(data) > 1 {
				switch uint16(bom) {
				case uint16(data[0])<<8 | uint16(data[1]):
					endian = bigEndian
					// Consume the BOM without producing a token.
					return 2, nil, nil
				case uint16(data[1])<<8 | uint16(data[0]):
					endian = littleEndian
					// Consume the BOM without producing a token.
					return 2, nil, nil
				}
			}
		}

		// Scan for newline-terminated lines.
		i := 0
		for {
			j := bytes.IndexByte(data[i:], '\n')
			if j < 0 {
				break
			}
			i += j
			switch i % 2 {
			case 1: // UTF-16BE: '\n' is the second byte of the code unit.
				// i is odd here, so data[i-1] always exists. This also covers
				// i == 1, i.e. an empty line terminated by a bare newline.
				if endian != littleEndian && data[i-1] == '\x00' {
					endian = bigEndian
					// We have a full newline-terminated line.
					return i + 1, dropCRBE(data[0 : i-1]), nil
				}
			case 0: // UTF-16LE: '\n' is the first byte of the code unit.
				if endian != bigEndian {
					if i+1 < len(data) {
						i++
						if data[i] == '\x00' {
							endian = littleEndian
							// We have a full newline-terminated line.
							return i + 1, dropCRLE(data[0 : i-1]), nil
						}
					}
				}
			}
			i++
		}

		// If we're at EOF, we have a final, non-terminated line. Return it.
		if atEOF {
			// Everything that is left is consumed, whether or not a CR is
			// trimmed from the token.
			advance = len(data)
			switch endian {
			case bigEndian:
				data = dropCRBE(data)
			case littleEndian:
				data = dropCRLE(data)
			default:
				data, endian = dropCR(data)
			}
			if endian == unknownEndian {
				// Nothing in the input revealed the byte order; fall back to
				// a per-platform default.
				if runtime.GOOS == "windows" {
					endian = littleEndian
				} else {
					endian = bigEndian
				}
			}
			return advance, data, nil
		}

		// Request more data.
		return 0, nil, nil
	}

	// Endian byte order function
	orderFunc := func() (byteOrder binary.ByteOrder) {
		switch endian {
		case bigEndian:
			byteOrder = binary.BigEndian
		case littleEndian:
			byteOrder = binary.LittleEndian
		}
		return byteOrder
	}

	return splitFunc, orderFunc
}

// main scans utf16.le.txt line by line. For every line it prints the decoded
// UTF-8 string and the raw UTF-16 bytes, each preceded by its length, and
// finally the byte order reported by the scanner's order function.
func main() {
	in, err := os.Open("utf16.le.txt")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer in.Close()
	fmt.Println(in.Name())

	// A nil byte order asks the split function to infer it from the data.
	// Assign binary.LittleEndian here to fix it up front (e.g. on Windows).
	var hint binary.ByteOrder
	split, order := ScanUTF16LinesFunc(hint)

	lines := bufio.NewScanner(bufio.NewReader(in))
	lines.Split(split)

	for lines.Scan() {
		raw := lines.Bytes()
		text := UTF16BytesToString(raw, order())
		fmt.Println(len(text), text)
		fmt.Println(len(raw), raw)
	}
	fmt.Println(order())

	if scanErr := lines.Err(); scanErr != nil {
		fmt.Println(scanErr)
	}
}

輸出:

utf16.le.txt 
15 "Hello, 世界" 
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0] 
0 
0 [] 
15 "Hello, 世界" 
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0] 
LittleEndian 

utf16.be.txt 
15 "Hello, 世界" 
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34] 
0 
0 [] 
15 "Hello, 世界" 
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34] 
BigEndian 
+0

現在我明白了沒有轉換的問題,它在readline中。所以問題被更新。 –

+0

這是修復您的問題的修改程序。 – peterSO

+0

感謝您的程序,我會根據您的修訂版對其進行修改,因爲換行符仍然有許多標準[鏈接](http://en.wikipedia.org/wiki/Newline)。由於沒有包去閱讀utf16,我想我也應該向google報告這個問題,因爲現代編程語言應該能夠正確處理unicode,尤其是在互聯網應用程序中。 –

4

例如:

package main 

import (
     "errors" 
     "fmt" 
     "log" 
     "unicode/utf16" 
) 

// utf16toString decodes the UTF-16 bytes in b into a Go string.
//
// A leading byte order mark is honoured and stripped: FF FE selects
// little-endian, FE FF selects big-endian. Without a mark the input is read
// as big-endian. An error is returned when len(b) is odd.
func utf16toString(b []uint8) (string, error) {
	if len(b)%2 == 1 {
		return "", errors.New("len(b) must be even")
	}

	// hi and lo are the offsets, within each byte pair, of the most and
	// least significant byte. Big-endian is the default layout.
	hi, lo := 0, 1
	if len(b) >= 2 {
		switch {
		case b[0] == 0xff && b[1] == 0xfe:
			hi, lo = 1, 0
			b = b[2:]
		case b[0] == 0xfe && b[1] == 0xff:
			b = b[2:]
		}
	}

	units := make([]uint16, 0, len(b)/2)
	for at := 0; at+1 < len(b); at += 2 {
		units = append(units, uint16(b[at+hi])<<8|uint16(b[at+lo]))
	}
	return string(utf16.Decode(units)), nil
}

// main decodes a fixed UTF-16 sample, standing in for data read from a file,
// and prints the result as a quoted string.
func main() {
	sample := []byte{
		255, 254, // byte order mark
		'[', 0, 'S', 0, 'c', 0, 'r', 0, 'i', 0, 'p', 0, 't', 0,
		' ', 0,
		'I', 0, 'n', 0, 'f', 0, 'o', 0, ']', 0,
		'\r', 0,
	}

	decoded, err := utf16toString(sample)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%q", decoded)
}

(也可在此處查看)

輸出:


"[Script Info]\r" 
+0

我還建議使用'encoding/binary'來讀取它作爲[] uint16開頭。 – cthom06

+0

@ cthom06:我不會推薦。 – zzzz

+0

@ cthom06爲什麼?請注意,UTF16中的字符並不總是以兩個字節編碼(這僅適用於BMP)。 –

6

golang.org/x/text/encoding/unicode最新版本可以更容易地做到這一點,因爲它包括unicode.BOMOverride,這將智能地解釋BOM。

這裏是ReadFileUTF16(),它與os.ReadFile()類似,但解碼UTF-16。

package main 

import (
    "bytes" 
    "fmt" 
    "io/ioutil" 
    "log" 
    "strings" 

    "golang.org/x/text/encoding/unicode" 
    "golang.org/x/text/transform" 
) 

// ReadFileUTF16 reads the named file in full, like ioutil.ReadFile, and
// returns its contents passed through a UTF-16 decoder.
//
// The decoder is built with unicode.BOMOverride around a big-endian UTF-16
// decoder that ignores byte order marks, so big-endian UTF-16 is the fallback
// and a byte order mark at the start of the data takes precedence over it.
//
// The result is whatever ioutil.ReadAll returns for the decoding reader,
// including its error.
func ReadFileUTF16(filename string) ([]byte, error) {
	contents, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	// Fallback decoder used when the data carries no byte order mark.
	fallback := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()

	decoding := transform.NewReader(bytes.NewReader(contents), unicode.BOMOverride(fallback))
	return ioutil.ReadAll(decoding)
}

// main decodes inputfile.txt with ReadFileUTF16, rewrites CRLF line endings
// to LF and prints the text.
func main() {
	decoded, err := ReadFileUTF16("inputfile.txt")
	if err != nil {
		log.Fatal(err)
	}

	toLF := strings.NewReplacer("\r\n", "\n")
	fmt.Println(toLF.Replace(string(decoded)))
}

這裏是 NewScannerUTF16,它類似於 os.Open(),但返回一個可以直接交給掃描儀(bufio.Scanner)使用的讀取器。

package main 

import (
    "bufio" 
    "fmt" 
    "log" 
    "os" 

    "golang.org/x/text/encoding/unicode" 
    "golang.org/x/text/transform" 
) 

type utfScanner interface { 
    Read(p []byte) (n int, err error) 
} 

// Creates a scanner similar to os.Open() but decodes the file as UTF-16. 
// Useful when reading data from MS-Windows systems that generate UTF-16BE 
// files, but will do the right thing if other BOMs are found. 
func NewScannerUTF16(filename string) (utfScanner, error) { 

    // Read the file into a []byte: 
    file, err := os.Open(filename) 
    if err != nil { 
     return nil, err 
    } 

    // Make an tranformer that converts MS-Win default to UTF8: 
    win16be := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM) 
    // Make a transformer that is like win16be, but abides by BOM: 
    utf16bom := unicode.BOMOverride(win16be.NewDecoder()) 

    // Make a Reader that uses utf16bom: 
    unicodeReader := transform.NewReader(file, utf16bom) 
    return unicodeReader, nil 
} 

// main reads inputfile.txt through NewScannerUTF16 and prints it line by
// line. A scanning error is reported on standard error.
func main() {
	src, err := NewScannerUTF16("inputfile.txt")
	if err != nil {
		log.Fatal(err)
	}

	lines := bufio.NewScanner(src)
	for more := lines.Scan(); more; more = lines.Scan() {
		// Println will add back the final '\n'.
		fmt.Println(lines.Text())
	}

	if scanErr := lines.Err(); scanErr != nil {
		fmt.Fprintln(os.Stderr, "reading inputfile:", scanErr)
	}
}
相關問題