2010-06-14 33 views

回答

2

here 所建議的解決方案在我看來仍然不錯(以下是該代碼的修改版本,仍然針對 Python 2 而非 Python 3,並附有使用示例):

#!/usr/bin/python 
# -*- coding: utf-8 -*- 

import codecs, logging, sys 
logging.basicConfig(level=logging.INFO)

# Maps each recognised byte-order mark (raw bytes) to the codec name used to
# decode the remainder of the file once the mark has been stripped.
bomdict = {
    codecs.BOM_UTF8: 'UTF8',
    codecs.BOM_UTF16_BE: 'UTF-16BE',
    codecs.BOM_UTF16_LE: 'UTF-16LE',
}


def read_unicode(filename):
    """Read *filename* and return its contents decoded to unicode text.

    The whole file is read as raw bytes. If the data starts with one of the
    byte-order marks listed in ``bomdict``, the mark is removed and the
    matching codec is used; otherwise the data is decoded as UTF-8.

    Raises whatever ``open`` raises for an unreadable path, and
    ``UnicodeDecodeError`` if the bytes are not valid in the chosen codec.
    """
    # Binary mode is required: the BOM comparison below works on raw bytes,
    # and text mode would either decode the data (Python 3) or translate
    # line endings inside multi-byte sequences (Python 2 on Windows).
    with open(filename, 'rb') as source:
        the_text = source.read()
    for bom, encoding in bomdict.items():
        if the_text.startswith(bom):
            logging.info('BOM found, using %s', encoding)
            the_text = the_text[len(bom):]
            break
    else:
        # Loop finished without a break: no known BOM was present.
        logging.info('No BOM, using utf8')
        encoding = 'UTF8'
    return the_text.decode(encoding)

# Usage example: write a UTF-16LE file that starts with its BOM, then read it
# back through read_unicode() and print the decoded text.
# The with-block guarantees the handle is closed even if a write fails.
with open('x.txt', 'wb') as f:
    f.write(codecs.BOM_UTF16_LE)
    f.write(u'zeé fóo!'.encode('UTF-16LE'))

# Parenthesised form prints the same single value on Python 2 and is also
# valid syntax on Python 3.
print(read_unicode('x.txt'))
1

這是 file.open() 的部分替代實現。它在 Python 2.6 下可以運作,但在 Python 3.1 中,我得到一個錯誤:

Traceback (most recent call last): 
    File "unicode-file.py", line 15, in <module> 
    old_file_write = file.write 
NameError: name 'file' is not defined 

對 Unicode 友好的 file.open() 替代方案

#!/usr/bin/python 
import codecs, sys, types 

# Keep a reference to the built-in open() before the wrapper defined later in
# this file rebinds the name `open`.
open_old = open

# Keep the original write method so a replacement write() can delegate to it
# for non-bytes data.
# Python 2 exposes the file type as the built-in `file`; Python 3 removed that
# name, so looking it up raises NameError. On Python 3 the object returned by
# the built-in open() in text mode is an io.TextIOWrapper, so its write method
# is used instead.
# NOTE(review): confirm io.TextIOWrapper is the intended target on Python 3.
try:
    old_file_write = file.write
except NameError:
    import io
    old_file_write = io.TextIOWrapper.write

class file():
    """File-like shim whose write() accepts bytes in addition to text.

    NOTE(review): this class defines no ``buffer`` attribute and does not
    subclass a real file type, so instances must be given a ``buffer`` by the
    caller, and delegating text writes to ``old_file_write`` will only work if
    that function accepts this kind of object -- confirm the intended design.
    """

    def write(self, d):
        """Write *d*: bytes go to ``self.buffer``, anything else to the
        saved original write method.
        """
        # `bytes` is a built-in name (an alias of str on Python 2.6+); the
        # `types` module has no `bytes` attribute, so `types.bytes` raised
        # AttributeError on every call.
        if isinstance(d, bytes):
            self.buffer.write(d)
        else:
            # The saved method is unbound, so the instance must be passed
            # explicitly as its first argument.
            old_file_write(self, d)

def open(filename, mode=None, bufsize=None):
    """Open *filename*, choosing the text codec from its byte-order mark.

    For mode ``"r"`` the first four bytes of the file are inspected. If they
    start with a UTF-8, UTF-32 or UTF-16 BOM, the file is reopened through
    ``codecs.open`` with the matching codec and positioned just past the BOM.
    With no BOM the file is assumed to be UTF-8. Any other mode is passed
    straight to the saved built-in ``open_old``.

    Parameters:
        filename: path of the file to open.
        mode: open mode; ``None`` (the default) is treated as ``"r"``.
        bufsize: buffering argument for the built-in open; ``None`` (the
            default) is treated as ``-1``, the built-in's own default.

    Returns:
        A ``codecs`` stream for mode ``"r"``, otherwise whatever the built-in
        open returns.
    """
    # The built-in open() rejects None for either argument, so substitute its
    # documented defaults instead of forwarding None.
    if mode is None:
        mode = "r"
    if bufsize is None:
        bufsize = -1

    # BOM detection only makes sense when reading text.
    if mode != "r":
        return open_old(filename, mode, bufsize)

    # Use the bytes constants from `codecs` so the comparison works on both
    # Python 2 (str) and Python 3 (bytes). The UTF-32 marks come before the
    # UTF-16 ones because the UTF-32LE BOM starts with the UTF-16LE BOM.
    known_boms = (
        (codecs.BOM_UTF8, "utf_8"),
        (codecs.BOM_UTF32_LE, "utf_32_le"),
        (codecs.BOM_UTF32_BE, "utf_32_be"),
        (codecs.BOM_UTF16_LE, "utf_16_le"),
        (codecs.BOM_UTF16_BE, "utf_16_be"),
    )

    # Read the first 4 bytes, and always release the probe handle.
    probe = open_old(filename, "rb")
    try:
        head = probe.read(4)
    finally:
        probe.close()

    for bom, encoding in known_boms:
        if head.startswith(bom):
            f = codecs.open(filename, mode, encoding)
            # Skip the BOM so it is not returned as part of the text.
            f.seek(len(bom), 0)
            return f

    # No BOM: assume the encoding is UTF-8 and read from the start.
    return codecs.open(filename, mode, "utf-8")

# now use the open(file, "r") 
1

我修改了 Alex 和索林的例子,使其在 Python 3 和 Python 2 中都能運作:

import codecs 
import io 

# Known byte-order marks as (BOM bytes, codec name, bytes to skip) triples.
# The UTF-32 entries precede the UTF-16 ones because the UTF-32LE BOM starts
# with the UTF-16LE BOM. The 'utf-8-sig' codec strips its own BOM, hence 0.
_boms = [
    (codecs.BOM_UTF8, 'utf-8-sig', 0),
    (codecs.BOM_UTF32_LE, 'utf-32le', 4),
    (codecs.BOM_UTF32_BE, 'utf-32be', 4),
    (codecs.BOM_UTF16_LE, 'utf-16le', 2),
    (codecs.BOM_UTF16_BE, 'utf-16be', 2),
]


def read_unicode(file_path):
    """Return the text of *file_path*, picking the codec from its BOM.

    The first four bytes are compared against ``_boms``. On a match the
    listed codec is used and the listed number of bytes is skipped; with no
    match the file is decoded as UTF-8 from the start. Decoding goes through
    a text wrapper, so line endings are translated as in text mode.

    Raises whatever ``io.open`` raises for an unreadable path, and
    ``UnicodeDecodeError`` if the bytes are not valid in the chosen codec.
    """
    with io.open(file_path, 'rb') as raw:
        head = raw.read(4)
        for bom, encoding, seek_to in _boms:
            if head.startswith(bom):
                break
        else:
            # Loop finished without a break: no known BOM was present.
            encoding, seek_to = 'utf-8', 0
        # Seek on the binary stream, where byte offsets are well defined.
        # Seeking a text stream to an arbitrary offset is documented as
        # undefined behaviour.
        raw.seek(seek_to)
        with io.TextIOWrapper(raw, encoding=encoding) as text:
            return text.read()