Здесь частично рабочая замена для file.open (). Это работает с Python 2.6, но на Python 3.1 я получаю сообщение об ошибке:
Traceback (most recent call last):
File "unicode-file.py", line 15, in <module>
old_file_write = file.write
NameError: name 'file' is not defined
Unicode-дружественная замена file.open ()
#!/usr/bin/python
import codecs, sys, types
# we save the file function handler because we want to override it
open_old = open
# on Python 3.x we overwrite write method in order to make it accept bytes in addition to str
old_file_write = file.write
class file():
def write(self, d):
if isinstance(d, types.bytes):
self.buffer.write(d)
else:
old_file_write(d)
def open(filename, mode=None, bufsize=None):
#try:
# we read the first 4 bytes just to be sure we use the right encoding
if(mode == "r"): # we are interested of detecting the mode only for read text
f = open_old(filename, "rb")
aBuf = f.read(4)
if aBuf[:3] == '\xEF\xBB\xBF' :
f = codecs.open(filename, mode, "utf_8")
f.seek(3,0)
elif aBuf[:4] == '\xFF\xFE\x00\x00':
f = codecs.open(filename, mode, "utf_32_le")
f.seek(4,0)
elif aBuf[:4] == '\x00\x00\xFE\xFF':
f = codecs.open(filename, mode, "utf_32_be")
f.seek(4,0)
elif aBuf[:2] == '\xFF\xFE':
f = codecs.open(filename, mode, "utf_16_le")
f.seek(2,0)
elif aBuf[:2] == '\xFE\xFF':
f = codecs.open(filename, mode, "utf_16_be")
f.seek(2,0)
else: # we assume that if there is no BOM, the encoding is UTF-8
f.close()
f = codecs.open(filename, mode, "utf-8")
f.seek(0)
return f
else:
return open_old(filename, mode, bufsize)
# now use the open(file, "r")