Skip to content

Commit af08446

Browse files
committed
[mypyc] Fix b64decode to match new cpython behavior
1 parent a399e1c commit af08446

2 files changed

Lines changed: 29 additions & 31 deletions

File tree

mypyc/lib-rt/base64/librt_base64.c

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -240,39 +240,21 @@ b64decode_handle_invalid_input(
240240
return PyErr_NoMemory();
241241
}
242242

243-
// Copy base64 characters and some padding to the new buffer
243+
int pad_chars = 0;
244+
// Copy base64 characters to the new buffer. Ignore padding to conform to RFC 4648 section 3.3.
244245
for (size_t i = 0; i < srclen; i++) {
245246
char c = src[i];
246247
if (is_valid_base64_char(c, false)) {
247248
newbuf[newbuf_len++] = c;
249+
pad_chars = 0;
248250
} else if (c == '=') {
249-
// Copy a necessary amount of padding
250-
int remainder = newbuf_len % 4;
251-
if (remainder == 0) {
252-
// No padding needed
253-
break;
254-
}
255-
int numpad = 4 - remainder;
256-
// Check that there is at least the required amount padding (CPython ignores
257-
// extra padding)
258-
while (numpad > 0) {
259-
if (i == srclen || src[i] != '=') {
260-
break;
261-
}
262-
newbuf[newbuf_len++] = '=';
263-
i++;
264-
numpad--;
265-
// Skip non-base64 alphabet characters within padding
266-
while (i < srclen && !is_valid_base64_char(src[i], true)) {
267-
i++;
268-
}
269-
}
270-
break;
251+
pad_chars++;
271252
}
272253
}
273254

255+
int quad_pos = newbuf_len % 4;
274256
// Stdlib always performs a non-strict padding check
275-
if (newbuf_len % 4 != 0) {
257+
if (quad_pos != 0 && quad_pos + pad_chars < 4) {
276258
if (freesrc) {
277259
PyMem_Free((void *)src);
278260
}
@@ -282,6 +264,15 @@ b64decode_handle_invalid_input(
282264
return NULL;
283265
}
284266

267+
if (quad_pos != 0) {
268+
// Add padding at the end to make the input length a multiple of 4. We know that this padding
269+
// is present in src because otherwise we would report the "Incorrect padding" error above.
270+
while (quad_pos < 4) {
271+
newbuf[newbuf_len++] = '=';
272+
quad_pos++;
273+
}
274+
}
275+
285276
size_t outlen = max_out;
286277
int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
287278
PyMem_Free(newbuf);

mypyc/test-data/run-base64.test

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ from typing import Any, cast
33
import base64
44
import binascii
55
import random
6+
import sys
67

78
from librt.base64 import b64encode, b64decode, urlsafe_b64encode, urlsafe_b64decode
89

@@ -121,6 +122,14 @@ def test_decode_with_non_base64_chars() -> None:
121122
check_decode(b"e" + b + b"A==", encoded=True)
122123
check_decode(b"eA=" + b + b"=", encoded=True)
123124

125+
def has_stdlib_b64decode_bugfix() -> bool:
126+
# stdlib b64decode has a bug in older Python versions where it skips processing the input data
127+
# after the first padded quad. It was changed to conform to RFC 4648 section 3.3 in CPython 3.13.13+,
128+
# 3.14.4+ and 3.15+. The librt implementation was changed to match the correct behavior regardless
129+
# of Python version, so some inputs produce different results than stdlib on older Python.
130+
_, minor, micro, _, _ = sys.version_info
131+
return minor > 14 or (minor == 14 and micro >= 4) or (minor == 13 and micro >= 13)
132+
124133
def check_decode_error(b: bytes, ignore_stdlib: bool = False) -> None:
125134
if not ignore_stdlib:
126135
with assertRaises(binascii.Error):
@@ -135,9 +144,7 @@ def test_decode_with_invalid_padding() -> None:
135144
check_decode_error(b"eA=")
136145
check_decode_error(b"eHk")
137146
check_decode_error(b"eA = ")
138-
139-
# Here stdlib behavior seems nonsensical, so we don't try to duplicate it
140-
check_decode_error(b"eA=a=", ignore_stdlib=True)
147+
check_decode_error(b"eA==x", ignore_stdlib=not has_stdlib_b64decode_bugfix())
141148

142149
def test_decode_with_extra_data_after_padding() -> None:
143150
check_decode(b"=", encoded=True)
@@ -146,10 +153,10 @@ def test_decode_with_extra_data_after_padding() -> None:
146153
check_decode(b"====", encoded=True)
147154
check_decode(b"eA===", encoded=True)
148155
check_decode(b"eHk==", encoded=True)
149-
# TODO: behavior in these cases changed in Python 3.14.4, we should match that.
150-
# check_decode(b"eA==x", encoded=True)
151-
# check_decode(b"eHk=x", encoded=True)
152-
# check_decode(b"eA==abc=======efg", encoded=True)
156+
if has_stdlib_b64decode_bugfix():
157+
check_decode(b"eA=a=", encoded=True)
158+
check_decode(b"eHk=x", encoded=True)
159+
check_decode(b"eA==abc=======efg", encoded=True)
153160

154161
def test_decode_wrappers() -> None:
155162
funcs: list[Any] = [b64decode, urlsafe_b64decode]

0 commit comments

Comments
 (0)