Skip to content

Commit 6c9880d

Browse files
authored
Fix BOM handling for custom error handlers (#960)
1 parent e60c887 commit 6c9880d

2 files changed

Lines changed: 21 additions & 7 deletions

File tree

Src/IronPython/Runtime/PythonEncoding.cs

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -790,7 +790,7 @@ public string GetString(IPythonBuffer input, int index, int count) {
790790
var fbuf1 = GetPythonDecoderFallbackBuffer(_pass1decoder);
791791

792792
// This allows for UnicodeDecodeError, if occurred, to contain the whole input
793-
if (fbuf1 != null) fbuf1.Data = input;
793+
if (fbuf1 != null) fbuf1.Data = Tuple.Create(input, index);
794794

795795
var span = input.AsReadOnlySpan().Slice(index, count);
796796
int len = _pass1decoder.GetCharCount(span, flush: true);
@@ -838,7 +838,7 @@ public PythonDecoderFallbackBuffer(bool isPass1, PythonEncoding encoding) {
838838
protected bool DecodingMode { get; private set; }
839839
protected int EncodingCharWidth { get; }
840840
protected int CodePage { get; }
841-
public IPythonBuffer? Data { get; set; }
841+
public Tuple<IPythonBuffer, int>? Data { get; set; }
842842

843843
public virtual void PrepareIncrement(bool forDecoding) {
844844
if (DecodingMode) {
@@ -1374,22 +1374,24 @@ public override ReadOnlyMemory<char> GetFallbackChars(byte[] bytesUnknown, int i
13741374
int pos;
13751375
if (_bytesData == null) {
13761376
if (Data != null) {
1377+
IPythonBuffer buf = Data.Item1;
1378+
int dataOffset = Data.Item2;
13771379
if (index < 0) {
13781380
// corner case, the unknown data starts at the end of the previous increment (or earlier)
13791381
if (_previousData.Length < -index)
13801382
throw new NotImplementedException($"Not enough lookback bytes to process decoding of this increment, increase '{nameof(MinNumLookbackBytes)}'");
1381-
var dataSpan = Data.AsReadOnlySpan();
1383+
var dataSpan = buf.AsReadOnlySpan().Slice(dataOffset);
13821384
var extData = new byte[-index + dataSpan.Length];
13831385
Array.Copy(_previousData, _previousData.Length + index, extData, 0, -index);
13841386
dataSpan.CopyTo(extData.AsSpan(-index));
13851387
bytesObj = _bytesData = Bytes.Make(extData);
13861388
pos = 0;
13871389
} else {
1388-
if (Data.Object is Bytes bytes && bytes.Count == Data.NumBytes()) {
1390+
if (buf.Object is Bytes bytes && bytes.Count == buf.NumBytes() && dataOffset == 0) {
13891391
// fast track, no data copy
13901392
bytesObj = _bytesData = new Bytes(bytes);
13911393
} else {
1392-
bytesObj = _bytesData = Bytes.Make(Data.AsReadOnlySpan().ToArray());
1394+
bytesObj = _bytesData = Bytes.Make(buf.AsReadOnlySpan().Slice(dataOffset).ToArray());
13931395
}
13941396
pos = index;
13951397
}
@@ -1402,7 +1404,9 @@ public override ReadOnlyMemory<char> GetFallbackChars(byte[] bytesUnknown, int i
14021404
} else {
14031405
bytesObj = _bytesData;
14041406
// if _bytesData is not null, Data is not null also
1405-
pos = index + _bytesData.Count - Data!.NumBytes();
1407+
IPythonBuffer buf = Data!.Item1;
1408+
int dataOffset = Data.Item2;
1409+
pos = index + _bytesData.Count - buf.NumBytes() + dataOffset;
14061410
}
14071411

14081412
// create exception object to hand over to the user-function...
@@ -1440,7 +1444,9 @@ public override void FinalizeIncrement(int endIndex, bool flush) {
14401444
if (flush) {
14411445
_previousDataLen = 0;
14421446
} else if (Data != null) {
1443-
var span = Data.AsReadOnlySpan();
1447+
IPythonBuffer buf = Data!.Item1;
1448+
int dataOffset = Data.Item2;
1449+
var span = buf.AsReadOnlySpan().Slice(dataOffset);
14441450
int retain = _previousData.Length - span.Length;
14451451
if (retain <= 0) {
14461452
_previousDataLen = 0;

Tests/modules/io_related/test_codecs.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -958,6 +958,14 @@ def test_encoding_error_eqhandler(ue):
958958
self.assertEqual(codecs.ascii_decode(data, 'test_dec_eq'), ("az", 4))
959959
self.assertEqual(codecs.charmap_decode(data, 'test_dec_eq', {ord('a'): 'a', ord('z'): 'z'}), ("az", 4))
960960

961+
# Test that BOM is properly accounted for
962+
data = b"a\x00\xDD\xDDz\x00"
963+
def test_encoding_error_bomhandler(ue):
964+
self.assertEqual(ue.object[ue.start:ue.end], b"\xDD\xDD")
965+
return ("", ue.end)
966+
codecs.register_error('test_bom', test_encoding_error_bomhandler)
967+
self.assertEqual(codecs.utf_16_decode(codecs.BOM_UTF16_LE + data, 'test_bom'), ("az", 8))
968+
961969
def test_lookup_error(self):
962970
#sanity
963971
self.assertRaises(LookupError, codecs.lookup_error, "blah garbage xyz")

0 commit comments

Comments
 (0)