Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-129173: Use _PyUnicodeError_GetParams in PyCodec_SurrogatePassErrors #129134

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 156 additions & 119 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
#define ENC_UTF32LE 4

static int
get_standard_encoding(const char *encoding, int *bytelength)
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
if (Py_TOLOWER(encoding[0]) == 'u' &&
Py_TOLOWER(encoding[1]) == 't' &&
Expand Down Expand Up @@ -1144,165 +1144,202 @@ get_standard_encoding(const char *encoding, int *bytelength)
return ENC_UNKNOWN;
}

/* This handler is declared static until someone demonstrates
a need to call it directly. */

/* Classify the codec name held in the str object `encoding`.
 *
 * On success, store the value returned by get_standard_encoding_impl()
 * in *code, let that helper fill *bytelength, and return 0.
 * Return -1 if PyUnicode_AsUTF8() fails; in that case neither output
 * parameter is written.
 */
static int
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
{
    const char *name = PyUnicode_AsUTF8(encoding);
    if (name != NULL) {
        *code = get_standard_encoding_impl(name, bytelength);
        return 0;
    }
    return -1;
}


static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
PyObject *encode;
const char *encoding;
int code;
int bytelength;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}
PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, false) < 0)
{
return NULL;
}

if (end - start > PY_SSIZE_T_MAX / bytelength)
end = start + PY_SSIZE_T_MAX / bytelength;
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
if (slen > PY_SSIZE_T_MAX / bytelength) {
end = start + PY_SSIZE_T_MAX / bytelength;
end = Py_MIN(end, objlen);
slen = Py_MAX(0, end - start);
}

PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
if (res == NULL) {
Py_DECREF(obj);
return NULL;
}

unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
for (Py_ssize_t i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
Py_DECREF(obj);
Py_DECREF(res);
picnixz marked this conversation as resolved.
Show resolved Hide resolved
goto bail;
}
outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
Py_DECREF(object);
return NULL;
}
switch (code) {
case ENC_UTF8:
switch (code) {
case ENC_UTF8: {
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
case ENC_UTF16LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF16LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
case ENC_UTF32LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF32LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
}
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
const unsigned char *p;
Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
p = (const unsigned char*)PyBytes_AS_STRING(object);
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
switch (code) {
case ENC_UTF8:
Py_DECREF(obj);
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
return restuple;

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, true) < 0)
{
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
Py_UCS4 ch = 0;
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
p += start;

if (objlen - start >= bytelength) {
switch (code) {
case ENC_UTF8: {
if ((p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
(p[2] & 0xc0) == 0x80) {
(p[2] & 0xc0) == 0x80)
{
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
ch = ((p[0] & 0x0f) << 12) +
((p[1] & 0x3f) << 6) +
(p[2] & 0x3f);
}
break;
case ENC_UTF16LE:
}
case ENC_UTF16LE: {
ch = p[1] << 8 | p[0];
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
ch = p[0] << 8 | p[1];
break;
case ENC_UTF32LE:
}
case ENC_UTF32LE: {
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
}
}
Py_DECREF(obj);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
goto bail;
}

Py_DECREF(object);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
res = PyUnicode_FromOrdinal(ch);
if (res == NULL)
return NULL;
return Py_BuildValue("(Nn)", res, start + bytelength);
PyObject *res = PyUnicode_FromOrdinal(ch);
if (res == NULL) {
return NULL;
}
return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


/* This handler is declared static until someone demonstrates
a need to call it directly. */
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
}
else {
wrong_exception_type(exc);
Expand Down
Loading