string.decode 自定义错误参数
我有一段Python 2.7的代码:
# coding: utf-8
#
f = open('data.txt', 'r')
for line in f:
line = line.decode(encoding='utf-8', errors='foo23')
print len(line)
f.close()
既然只有以下这些有效的错误处理方式被注册,为什么Python不会报错呢?
- strict(严格)
- ignore(忽略)
- replace(替换)
- xmlcharrefreplace(XML字符引用替换)
- backslashreplace(反斜杠替换)
文档上说你可以注册自己的错误处理方式,但我并没有注册'foo23',而且Python代码还是正常运行,没有报错或警告。如果我改变编码参数,就会报错,但如果我把错误处理方式改成一个自定义的字符串,所有的事情就都没问题了。
line = line.decode(encoding='utf-9', errors='foo23')
File "parse.py", line 7, in <module>
line = line.decode(encoding='utf-9', errors='foo23')
LookupError: unknown encoding: utf-9
3 个回答
正如jfs的回答所说,当你提供一个无效的错误处理器时,Python通常不会报错,因为在解码没有错误的时候,Python并不会检查这个错误处理器是否有效。
不过,这种行为其实是依赖于具体的实现方式的。你会发现,在CPython中,encode
和decode
函数在遇到错误之前并不会检查错误处理器是否存在。
而在IronPython中,encode
和decode
函数在尝试编码或解码之前会检查指定的错误处理器是否存在,因此你给出的示例代码会产生这样的错误:
Traceback (most recent call last):
File ".\code.py", line 6, in <module>
LookupError: unknown error handler name 'foo23'
当然,其他的Python实现可能在这种情况下表现不同。
我想确认CPython确实是在遇到解码错误时才验证错误处理器,而IronPython则不是,所以我查看了这两种实现的源代码。
CPython
下面是Python 2.6.2中unicodeobject.c
文件里的PyUnicode_DecodeUTF8Stateful
函数的代码。这个函数似乎负责大部分UTF-8编码字节的解码工作。
/* Decode `size` bytes of UTF-8 data starting at `s` into a new Unicode object.

   errors:   error handler name. It is only forwarded to
             unicode_decode_call_errorhandler() on the utf8Error path, so
             this function never looks at it unless a decoding error occurs.
   consumed: if non-NULL, a truncated sequence at the end of the input stops
             decoding instead of being reported as an error, and *consumed
             receives the number of input bytes that were processed.

   Returns the decoded object, or NULL on failure. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
int n;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
/* Both start out NULL and are filled in by
   unicode_decode_call_errorhandler() the first time an error is hit. */
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)unicode;
}
/* Unpack UTF-8 encoded data */
p = unicode->str;
e = s + size;
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
/* Bytes below 0x80 are copied through as a single code unit. */
if (ch < 0x80) {
*p++ = (Py_UNICODE)ch;
s++;
continue;
}
/* n is looked up from the lead byte; the cases below treat it as the
   total byte length of the sequence (table is defined elsewhere). */
n = utf8_code_length[ch];
if (s + n > e) {
/* Sequence runs past the end of the input: stop quietly when the caller
   asked for `consumed`, otherwise report an error. */
if (consumed)
break;
else {
errmsg = "unexpected end of data";
startinpos = s-starts;
endinpos = size;
goto utf8Error;
}
}
switch (n) {
case 0:
errmsg = "unexpected code byte";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 2:
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+2;
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
/* A 2-byte sequence must encode a value of at least 0x80. */
if (ch < 0x80) {
startinpos = s-starts;
endinpos = startinpos+2;
errmsg = "illegal encoding";
goto utf8Error;
}
else
*p++ = (Py_UNICODE)ch;
break;
case 3:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+3;
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
if (ch < 0x0800) {
/* Note: UTF-8 encodings of surrogates are considered
legal UTF-8 sequences;
XXX For wide builds (UCS-4) we should probably try
to recombine the surrogates into a single code
unit.
*/
errmsg = "illegal encoding";
startinpos = s-starts;
endinpos = startinpos+3;
goto utf8Error;
}
else
*p++ = (Py_UNICODE)ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+4;
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
UTF-16 */
{
errmsg = "illegal encoding";
startinpos = s-starts;
endinpos = startinpos+4;
goto utf8Error;
}
#ifdef Py_UNICODE_WIDE
*p++ = (Py_UNICODE)ch;
#else
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
/* low surrogate = bottom 10 bits added to DC00 */
*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
break;
default:
/* Other sizes are only needed for UCS-4 */
errmsg = "unsupported Unicode code range";
startinpos = s-starts;
endinpos = startinpos+n;
goto utf8Error;
}
s += n;
continue;
/* Reached only via the gotos above, i.e. only when a decoding error was
   detected. This is the sole place where `errors` is used. A zero return
   from the call means s and p were updated and the loop resumes. */
utf8Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf8", errmsg,
starts, size, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p))
goto onError;
}
if (consumed)
*consumed = s-starts;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return (PyObject *)unicode;
/* Failure path: drop the handler/exception references (if any) and the
   partially built result. */
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_DECREF(unicode);
return NULL;
}
我们可以看到这个函数调用了另一个函数unicode_decode_call_errorhandler
,这个函数才是真正使用错误处理器的地方。这个函数的代码如下:
/* Invoke the decoding error handler named by `errors` for the input range
   [*startinpos, *endinpos) and splice its replacement text into *output.

   *errorHandler and *exceptionObject act as caches owned by the caller: they
   are filled in on the first call and reused on later calls.
   On success, *inptr, *endinpos, *outptr and *outpos are advanced and 0 is
   returned; on any failure -1 is returned. */
static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
const char *encoding, const char *reason,
const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
/* Format string for PyArg_ParseTuple; the text after the 4-character
   "O!n;" prefix is also reused below (&argparse[4]) as a TypeError message. */
static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Py_ssize_t requiredsize;
Py_ssize_t newpos;
Py_UNICODE *repptr;
Py_ssize_t repsize;
int res = -1;
/* The handler name is resolved here, on first use; an unknown name makes
   PyCodec_LookupError() return NULL and this call fail. */
if (*errorHandler == NULL) {
*errorHandler = PyCodec_LookupError(errors);
if (*errorHandler == NULL)
goto onError;
}
/* Create the exception object once, then only update its fields on
   subsequent calls. */
if (*exceptionObject == NULL) {
*exceptionObject = PyUnicodeDecodeError_Create(
encoding, input, insize, *startinpos, *endinpos, reason);
if (*exceptionObject == NULL)
goto onError;
}
else {
if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
goto onError;
if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
goto onError;
if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
goto onError;
}
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
if (restuple == NULL)
goto onError;
if (!PyTuple_Check(restuple)) {
PyErr_SetString(PyExc_TypeError, &argparse[4]);
goto onError;
}
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
goto onError;
/* A negative position returned by the handler counts from the end of the
   input. */
if (newpos<0)
newpos = insize+newpos;
if (newpos<0 || newpos>insize) {
PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
goto onError;
}
/* need more space? (at least enough for what we
have+the replacement+the rest of the string (starting
at the new input position), so we won't have to check space
when there are no errors in the rest of the string) */
repptr = PyUnicode_AS_UNICODE(repunicode);
repsize = PyUnicode_GET_SIZE(repunicode);
requiredsize = *outpos + repsize + insize-newpos;
if (requiredsize > outsize) {
if (requiredsize<2*outsize)
requiredsize = 2*outsize;
if (_PyUnicode_Resize(output, requiredsize) < 0)
goto onError;
/* The resize may have changed *output, so recompute the write pointer. */
*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
}
*endinpos = newpos;
*inptr = input + newpos;
Py_UNICODE_COPY(*outptr, repptr, repsize);
*outptr += repsize;
*outpos += repsize;
/* we made it! */
res = 0;
/* Shared exit for success and failure; res tells them apart. */
onError:
Py_XDECREF(restuple);
return res;
}
由于PyUnicode_DecodeUTF8Stateful
用一个NULL
的错误处理器调用了unicode_decode_call_errorhandler
,所以unicode_decode_call_errorhandler
会调用PyCodec_LookupError
,这个函数最终会验证提供的错误处理器。请看下面的代码。
PyObject *PyCodec_LookupError(const char *name)
{
PyObject *handler = NULL;
PyInterpreterState *interp = PyThreadState_GET()->interp;
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return NULL;
if (name==NULL)
name = "strict";
handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
if (!handler)
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
else
Py_INCREF(handler);
return handler;
}
注意,PyUnicode_DecodeUTF8Stateful
中调用unicode_decode_call_errorhandler
的代码是在utf8Error标签下,只有在解码遇到错误时才能到达这个部分。
IronPython
在IronPython 2.7.9中,解码是在下面的StringOps.DoDecode
函数中处理的(在StringOps.cs
文件里)。
// Decodes the bytes of `s` with encoding `e`, using the error handling policy
// named by `errors`, and returns the decoded string.
// numBytes is set to the number of input bytes after the starting offset; when
// an ExceptionFallBack recorded bad bytes, their count is subtracted again.
internal static string DoDecode(CodeContext context, string s, string errors, string encoding, Encoding e, bool final, out int numBytes) {
byte[] bytes = s.MakeByteArray();
int start = GetStartingOffset(e, bytes);
numBytes = bytes.Length - start;
#if FEATURE_ENCODING
// CLR's encoder exceptions have a 1-1 mapping w/ Python's encoder exceptions
// so we just clone the encoding & set the fallback to throw in strict mode.
e = (Encoding)e.Clone();
// The fallback is chosen BEFORE any decoding happens (GetString is only
// called after this switch).
switch (errors) {
// "backslashreplace" and "xmlcharrefreplace" share the "strict" fallback here.
case "backslashreplace":
case "xmlcharrefreplace":
case "strict": e.DecoderFallback = final ? DecoderFallback.ExceptionFallback : new ExceptionFallBack(numBytes, e is UTF8Encoding); break;
case "replace": e.DecoderFallback = ReplacementFallback; break;
case "ignore": e.DecoderFallback = new PythonDecoderFallback(encoding, s, null); break;
// Any other name is looked up in the registered handlers right away; the
// lookup result is passed through LightExceptions.CheckAndThrow.
// NOTE(review): presumably this is where an unknown name raises -- confirm
// against CheckAndThrow.
default:
e.DecoderFallback = new PythonDecoderFallback(encoding, s, LightExceptions.CheckAndThrow(PythonOps.LookupEncodingError(context, errors)));
break;
}
#endif
string decoded = e.GetString(bytes, start, numBytes);
#if FEATURE_ENCODING
// Exclude any bad bytes recorded by the exception fallback from the count
// reported to the caller.
if (e.DecoderFallback is ExceptionFallBack fallback) {
byte[] badBytes = fallback.buffer.badBytes;
if (badBytes != null) {
numBytes -= badBytes.Length;
}
}
#endif
return decoded;
}
在这里,DoDecode
函数在解码之前的switch
语句中创建错误处理器。如果包含错误处理器名称的字符串(errors
)不是已知的内置处理器之一,DoDecode
会创建一个PythonDecoderFallback
对象,并把通过PythonOps.LookupEncodingError
函数从已注册的错误处理器字典中取得的Python函数对象传给它(见下文)。
// Returns the error handler registered under `name` in the language context's
// ErrorHandlers dictionary. When no such handler exists, returns the result
// of LightExceptions.Throw wrapping a LookupError instead.
[LightThrowing]
internal static object LookupEncodingError(CodeContext/*!*/ context, string name) {
    Dictionary<string, object> errorHandlers = context.LanguageContext.ErrorHandlers;
    // The lookup is done while holding the lock on the dictionary.
    lock (errorHandlers) {
        object handler;
        if (!errorHandlers.TryGetValue(name, out handler)) {
            return LightExceptions.Throw(PythonOps.LookupError("unknown error handler name '{0}'", name));
        }
        return handler;
    }
}
当LookupEncodingError
在errorHandlers
字典中找不到给定name
的错误处理器时,它会“抛出”一个LookupError LightException
——也就是说,它会创建一个LightException
对象并返回。这个对象随后会被LightExceptions.CheckAndThrow
函数检查,最终会产生在IronPython中调用decode
时遇到的“未知错误处理器名称”的错误。
再次强调,所有这些都发生在DoDecode
中,在调用Encoding
对象的GetString
方法之前,因此无论是否有解码错误,IronPython在遇到无效错误处理器时都会产生错误。
在这里,errors
这个参数是用来告诉str.decode()
函数你希望如何处理错误的,它本身不会引发任何错误。你在第二个例子中遇到错误的原因是因为你给这个函数传递了一个无效的encoding
参数,除此之外没有其他原因。
如果在解码的过程中没有出现错误,那么errors
这个参数就不会被用到,它的值无所谓,只要是个字符串就行:
>>> b'\x09'.decode('utf-8', errors='abc')
u'\t'
如果字节无法使用指定的编码进行解码,那么就会使用错误处理程序。如果你指定了一个不存在的错误处理程序,就会出现错误:
>>> b'\xff'.decode('utf-8', errors='abc')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "../lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
LookupError: unknown error handler name 'abc'