Skip to content

Commit 406122f

Browse files
authored
Support custom decoding error handler (#956)
* Support custom decoding error handler
* Eliminate data copy when decoding IPythonBuffer with PythonEncoding
* Eliminate data copy when decoding IPythonBuffer with PythonEncoding on net46
* Disable failing test on Mono
* Eliminate second encoding pass for common cases
* Update after review
* Disable failing test on Mono
1 parent 23bbd6e commit 406122f

6 files changed

Lines changed: 610 additions & 289 deletions

File tree

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 27 additions & 146 deletions
Original file line number | Diff line number | Diff line change
@@ -1780,6 +1780,9 @@ internal static string RawDecode(CodeContext/*!*/ context, IBufferProtocol data,
17801780
private static DecoderFallback ReplacementFallback = new DecoderReplacementFallback("\ufffd");
17811781

17821782
internal static string DoDecode(CodeContext context, IPythonBuffer buffer, string? errors, string encoding, Encoding e, int numBytes = -1) {
1783+
// Precondition: only bytes-like buffers accepted
1784+
Debug.Assert(buffer.IsCContiguous());
1785+
17831786
var span = buffer.AsReadOnlySpan();
17841787
int start = GetStartingOffset(span, e);
17851788
int length = (numBytes >= 0 ? numBytes : span.Length) - start;
@@ -1791,40 +1794,35 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) {
17911794
enc.DecoderFallback = fb;
17921795
return enc;
17931796
}
1797+
PythonEncoding? pe = null; // to avoid downcasting later
17941798
switch (errors) {
17951799
case null:
17961800
case "backslashreplace":
17971801
case "xmlcharrefreplace":
17981802
case "strict": e = setFallback(e, new ExceptionFallback(e is UTF8Encoding)); break;
17991803
case "replace": e = setFallback(e, ReplacementFallback); break;
1800-
case "ignore": e = setFallback(e, new PythonDecoderFallback(encoding, buffer, start)); break;
1801-
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e, encoding); break;
1802-
case "surrogatepass": e = new PythonSurrogatePassEncoding(e, encoding); break;
1803-
default:
1804-
e = setFallback(e, new PythonDecoderFallback(encoding,
1805-
buffer, start,
1806-
() => LightExceptions.CheckAndThrow(PythonOps.LookupEncodingError(context, errors))));
1807-
break;
1804+
case "ignore": e = setFallback(e, new DecoderReplacementFallback(string.Empty)); break;
1805+
case "surrogateescape": e = pe = new PythonSurrogateEscapeEncoding(e, encoding); break;
1806+
case "surrogatepass": e = pe = new PythonSurrogatePassEncoding(e, encoding); break;
1807+
default: e = pe = new PythonErrorHandlerEncoding(context, e, encoding, errors); break;
18081808
}
18091809

18101810
string decoded = string.Empty;
18111811
try {
1812-
unsafe {
1813-
fixed (byte* ptr = span.Slice(start)) {
1814-
if (ptr != null) {
1815-
if (e is UnicodeEscapeEncoding ue) {
1816-
// This overload is not virtual, but the base implementation is inefficient for this encoding
1817-
decoded = ue.GetString(ptr, length);
1818-
} else {
1819-
decoded = e.GetString(ptr, length);
1820-
}
1821-
}
1812+
if (pe != null) {
1813+
decoded = pe.GetString(buffer, start, length);
1814+
} else {
1815+
if (e is UnicodeEscapeEncoding ue) {
1816+
// This overload is not virtual, but the base implementation is inefficient for this encoding
1817+
decoded = ue.GetString(span.Slice(start, length));
1818+
} else {
1819+
decoded = e.GetString(span.Slice(start, length));
18221820
}
18231821
}
18241822
} catch (DecoderFallbackException ex) {
18251823
// augmenting the caught exception instead of creating UnicodeDecodeError to preserve the stack trace
1826-
ex.Data["encoding"] = encoding;
1827-
ex.Data["object"] = Bytes.Make(span.Slice(start, length).ToArray());
1824+
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding;
1825+
if (!ex.Data.Contains("object")) ex.Data["object"] = Bytes.Make(span.Slice(start, length).ToArray()); ;
18281826
throw;
18291827
}
18301828

@@ -2219,6 +2217,8 @@ private string EscapeEncode(string s, int index, int count) {
22192217
ReprEncode(s, index, count, isUniEscape: true);
22202218
}
22212219

2220+
public override string EncodingName => _raw ? "rawunicodeescape" : "unicodeescape";
2221+
22222222
public override int GetByteCount(string s)
22232223
=> EscapeEncode(s, 0, s.Length).Length;
22242224

@@ -2240,10 +2240,12 @@ public override int GetBytes(char[] chars, int charIndex, int charCount, byte[]
22402240
public override string GetString(byte[] bytes, int index, int count)
22412241
=> LiteralParser.ParseString(bytes, index, count, _raw, GetErrorHandler());
22422242

2243-
public new unsafe string GetString(byte* bytes, int byteCount) {
2244-
var data = new ReadOnlySpan<byte>(bytes, byteCount);
2245-
return LiteralParser.ParseString(data, _raw, GetErrorHandler());
2246-
}
2243+
#if NETCOREAPP
2244+
public new string GetString(ReadOnlySpan<byte> bytes)
2245+
#else
2246+
public string GetString(ReadOnlySpan<byte> bytes)
2247+
#endif
2248+
=> LiteralParser.ParseString(bytes, _raw, GetErrorHandler());
22472249

22482250
public override unsafe int GetCharCount(byte* bytes, int count)
22492251
=> LiteralParser.ParseString(new ReadOnlySpan<byte>(bytes, count), _raw, GetErrorHandler()).Length;
@@ -2303,127 +2305,6 @@ public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[]
23032305

23042306
#region Unicode Encode/Decode Fallback Support
23052307

2306-
/// When encoding or decoding strings if an error occurs CPython supports several different
2307-
/// behaviors, in addition it supports user-extensible behaviors as well. For the default
2308-
/// behavior we're ok - both of us support throwing and replacing. For custom behaviors
2309-
/// we define a single fallback for decoding and encoding that calls the python function to do
2310-
/// the replacement.
2311-
///
2312-
/// When we do the replacement we call the provided handler w/ a UnicodeEncodeError or UnicodeDecodeError
2313-
/// object which contains:
2314-
/// encoding (string, the encoding the user requested)
2315-
/// object (the original string or bytes being encoded/decoded)
2316-
/// start (the start of the invalid sequence)
2317-
/// end (the exclusive end of the invalid sequence)
2318-
/// reason (the error message, e.g. 'unexpected code byte', not sure of others)
2319-
///
2320-
/// The error handler returns a tuple of (str, int) where str is the replacement string
2321-
/// and int is an index where encoding/decoding should continue.
2322-
/// TODO: the returned int must currently be equal to end (i.e. the index cannot be adjusted); any other value raises NotImplementedException.
2323-
2324-
private class PythonDecoderFallbackBuffer : DecoderFallbackBuffer {
2325-
private readonly object? _function;
2326-
private readonly string _encoding;
2327-
private readonly IPythonBuffer _data;
2328-
private readonly int _offset;
2329-
private Bytes? _byteData;
2330-
private string? _buffer;
2331-
private int _bufferIndex;
2332-
2333-
public PythonDecoderFallbackBuffer(string encoding, IPythonBuffer data, int offset, object? callable) {
2334-
_encoding = encoding;
2335-
_data = data;
2336-
_offset = offset;
2337-
_function = callable;
2338-
}
2339-
2340-
public override int Remaining {
2341-
get {
2342-
if (_buffer == null) return 0;
2343-
return _buffer.Length - _bufferIndex;
2344-
}
2345-
}
2346-
2347-
public override char GetNextChar() {
2348-
if (_buffer == null || _bufferIndex >= _buffer.Length) return Char.MinValue;
2349-
2350-
return _buffer[_bufferIndex++];
2351-
}
2352-
2353-
public override bool MovePrevious() {
2354-
if (_bufferIndex > 0) {
2355-
_bufferIndex--;
2356-
return true;
2357-
}
2358-
return false;
2359-
}
2360-
2361-
public override void Reset() {
2362-
_buffer = null;
2363-
_bufferIndex = 0;
2364-
base.Reset();
2365-
}
2366-
2367-
public override bool Fallback(byte[] bytesUnknown, int index) {
2368-
if (_function != null) {
2369-
// create the exception object to hand to the user-function...
2370-
_byteData ??= Bytes.Make(_data.AsReadOnlySpan().Slice(_offset).ToArray());
2371-
var exObj = PythonExceptions.CreatePythonThrowable(PythonExceptions.UnicodeDecodeError, _encoding, _byteData, index, index + bytesUnknown.Length, "unexpected code byte");
2372-
2373-
// call the user function...
2374-
object? res = PythonCalls.Call(_function, exObj);
2375-
2376-
string replacement = CheckReplacementTuple(res, "decoding", index + bytesUnknown.Length);
2377-
2378-
// finally process the user's request.
2379-
_buffer = replacement;
2380-
_bufferIndex = 0;
2381-
return true;
2382-
}
2383-
2384-
return false;
2385-
}
2386-
2387-
}
2388-
2389-
private class PythonDecoderFallback : DecoderFallback {
2390-
private readonly string encoding;
2391-
private readonly IPythonBuffer data;
2392-
private readonly int offset;
2393-
private readonly Func<object>? lookup;
2394-
private object? function;
2395-
2396-
public PythonDecoderFallback(string encoding, IPythonBuffer data, int offset, Func<object>? lookup = null) {
2397-
this.encoding = encoding;
2398-
this.data = data;
2399-
this.offset = offset;
2400-
this.lookup = lookup;
2401-
}
2402-
2403-
public override DecoderFallbackBuffer CreateFallbackBuffer() {
2404-
if (function == null && lookup != null) {
2405-
function = lookup.Invoke();
2406-
}
2407-
return new PythonDecoderFallbackBuffer(encoding, data, offset, function);
2408-
}
2409-
2410-
public override int MaxCharCount {
2411-
get { throw new NotImplementedException(); }
2412-
}
2413-
}
2414-
2415-
private static string CheckReplacementTuple(object? res, string encodeOrDecode, int cursorPos) {
2416-
// verify the result is sane...
2417-
if (res is PythonTuple tres && tres.__len__() == 2
2418-
&& Converter.TryConvertToString(tres[0], out string? replacement)
2419-
&& Converter.TryConvertToInt32(tres[1], out int newPos)) {
2420-
if (newPos != cursorPos) throw new NotImplementedException($"Moving {encodeOrDecode} cursor not implemented yet");
2421-
return replacement;
2422-
}
2423-
2424-
throw PythonOps.TypeError("{1} error handler must return tuple containing (str, int), got {0}", PythonOps.GetPythonTypeName(res), encodeOrDecode);
2425-
}
2426-
24272308
private class BackslashEncoderReplaceFallback : EncoderFallback {
24282309
private class BackslashReplaceFallbackBuffer : EncoderFallbackBuffer {
24292310
private List<char> _buffer = new List<char>();
@@ -2931,6 +2812,6 @@ internal static void IdentifyUtfEncoding(string encodingName, out int charWidth,
29312812
}
29322813
}
29332814

2934-
#endregion
2815+
#endregion
29352816
}
29362817
}

0 commit comments

Comments
 (0)