Skip to content

Commit e60c887

Browse files
authored
Remove PythonAsciiEncoding (#959)
* Remove PythonAsciiEncoding
* Fix typo
* Align encoding names with CPython
1 parent 406122f commit e60c887

8 files changed

Lines changed: 150 additions & 407 deletions

File tree

Src/IronPython.Modules/_codecs.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ public static void register_error(CodeContext/*!*/ context, [NotNull]string name
4545

4646
public static PythonTuple ascii_decode(CodeContext context, [NotNull]IBufferProtocol input, string? errors = null) {
4747
using var buffer = input.GetBuffer();
48-
return DoDecode(context, "ascii", PythonAsciiEncoding.Instance, buffer, errors).ToPythonTuple();
48+
return DoDecode(context, "ascii", Encoding.ASCII, buffer, errors).ToPythonTuple();
4949
}
5050

5151
public static PythonTuple ascii_encode(CodeContext context, [NotNull]string input, string? errors = null)
52-
=> DoEncode(context, "ascii", PythonAsciiEncoding.Instance, input, errors).ToPythonTuple();
52+
=> DoEncode(context, "ascii", Encoding.ASCII, input, errors).ToPythonTuple();
5353

5454
#endregion
5555

Src/IronPython/Runtime/Operations/MarshalOps.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public static object GetObject (IEnumerator<byte> bytes) {
4242
// True: 'T'
4343
// False: 'F'
4444
// Float: 'f', str len, float in str
45-
// string: 't', int len, bytes (ascii)
45+
// string: 't', int len, bytes (ascii) - obsolete, Python 2 legacy, never used for writing
4646
// string: 'u', int len, bytes (unicode)
4747
// string: 'R' <id> - refer to interned string
4848
// StopIteration: 'S'
@@ -485,7 +485,7 @@ private int ReadInt32 () {
485485
private double ReadFloatStr () {
486486
MoveNext ();
487487

488-
string str = DecodeString (PythonAsciiEncoding.Instance, ReadBytes (_myBytes.Current));
488+
string str = DecodeString (Encoding.ASCII, ReadBytes (_myBytes.Current));
489489

490490
double res = 0;
491491
if (double.TryParse (str, out res)) {
@@ -536,7 +536,8 @@ private object ReadBinaryFloat () {
536536
}
537537

538538
private object ReadAsciiString () {
539-
string res = DecodeString (PythonAsciiEncoding.Instance, ReadBytes (ReadInt32 ()));
539+
// Legacy IronPython 2 behavior, accepts Latin-1
540+
string res = DecodeString (StringOps.Latin1Encoding, ReadBytes (ReadInt32 ()));
540541
_strings[_strings.Count] = res;
541542
return res;
542543
}

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,12 +1682,27 @@ private static bool IsSign(char ch) {
16821682
return ch == '+' || ch == '-';
16831683
}
16841684

1685-
internal static string GetEncodingName(Encoding encoding, bool normalize = true) {
1685+
internal static string GetEncodingName(Encoding encoding, bool normalize = true, string defaultName = "unknown") {
16861686
string? name = null;
16871687

16881688
// if we have a valid code page try and get a reasonable name. The
16891689
// web names / mail displays tend to match CPython's terse names
16901690
if (encoding.CodePage != 0) {
1691+
switch (encoding.CodePage) {
1692+
1693+
// recognize a few common cases
1694+
case 1200: name = (defaultName == "utf-16" && BitConverter.IsLittleEndian) ? defaultName : "utf-16-le"; break;
1695+
case 1201: name = (defaultName == "utf-16" && !BitConverter.IsLittleEndian) ? defaultName : "utf-16-be"; break;
1696+
1697+
case 12000: name = (defaultName == "utf-32" && BitConverter.IsLittleEndian) ? defaultName : "utf-32-le"; break;
1698+
case 12001: name = (defaultName == "utf-32" && !BitConverter.IsLittleEndian) ? defaultName : "utf-32-be"; break;
1699+
1700+
case 20127: name = "ascii"; break;
1701+
case 28591: name = "latin-1"; break;
1702+
1703+
case 65000: name = "utf-7"; break;
1704+
case 65001: name = "utf-8"; break;
1705+
}
16911706
#if !NETCOREAPP && !NETSTANDARD
16921707
if (encoding.IsBrowserDisplay) {
16931708
name = encoding.WebName;
@@ -1699,30 +1714,18 @@ internal static string GetEncodingName(Encoding encoding, bool normalize = true)
16991714
#endif
17001715

17011716
if (name == null) {
1702-
switch (encoding.CodePage) {
1703-
1704-
// recognize a few common cases
1705-
case 1200: name = "utf-16LE"; break;
1706-
case 1201: name = "utf-16BE"; break;
1707-
1708-
case 12000: name = "utf-32LE"; break;
1709-
case 12001: name = "utf-32BE"; break;
1710-
1711-
case 20127: name = "us-ascii"; break;
1712-
case 28591: name = "iso-8859-1"; break;
1713-
1714-
case 65000: name = "utf-7"; break;
1715-
case 65001: name = "utf-8"; break;
1716-
1717-
// otherwise use a code page number which also matches CPython
1718-
default: name = "cp" + encoding.CodePage; break;
1719-
}
1717+
// otherwise use a code page number which also matches CPython
1718+
name = "cp" + encoding.CodePage;
17201719
}
17211720
}
17221721

17231722
if (name == null) {
17241723
// otherwise just finally fall back to the human readable name
1725-
name = encoding.EncodingName;
1724+
try {
1725+
name = encoding.EncodingName; // may throw on .NET Core for some encodings
1726+
} catch (NotSupportedException) {
1727+
name = defaultName;
1728+
}
17261729
}
17271730

17281731
return normalize ? NormalizeEncodingName(name) : name;
@@ -1802,9 +1805,9 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) {
18021805
case "strict": e = setFallback(e, new ExceptionFallback(e is UTF8Encoding)); break;
18031806
case "replace": e = setFallback(e, ReplacementFallback); break;
18041807
case "ignore": e = setFallback(e, new DecoderReplacementFallback(string.Empty)); break;
1805-
case "surrogateescape": e = pe = new PythonSurrogateEscapeEncoding(e, encoding); break;
1806-
case "surrogatepass": e = pe = new PythonSurrogatePassEncoding(e, encoding); break;
1807-
default: e = pe = new PythonErrorHandlerEncoding(context, e, encoding, errors); break;
1808+
case "surrogateescape": e = pe = new PythonSurrogateEscapeEncoding(e); break;
1809+
case "surrogatepass": e = pe = new PythonSurrogatePassEncoding(e); break;
1810+
default: e = pe = new PythonErrorHandlerEncoding(context, e, errors); break;
18081811
}
18091812

18101813
string decoded = string.Empty;
@@ -1821,7 +1824,7 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) {
18211824
}
18221825
} catch (DecoderFallbackException ex) {
18231826
// augmenting the caught exception instead of creating UnicodeDecodeError to preserve the stack trace
1824-
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding;
1827+
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = GetEncodingName(e, normalize: false, defaultName: encoding);
18251828
if (!ex.Data.Contains("object")) ex.Data["object"] = Bytes.Make(span.Slice(start, length).ToArray()); ;
18261829
throw;
18271830
}
@@ -1878,9 +1881,9 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) {
18781881
case "backslashreplace": e = setFallback(e, new BackslashEncoderReplaceFallback()); break;
18791882
case "xmlcharrefreplace": e = setFallback(e, new XmlCharRefEncoderReplaceFallback()); break;
18801883
case "ignore": e = setFallback(e, new EncoderReplacementFallback(string.Empty)); break;
1881-
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e, encoding); break;
1882-
case "surrogatepass": e = new PythonSurrogatePassEncoding(e, encoding); break;
1883-
default: e = new PythonErrorHandlerEncoding(context, e, encoding, errors); break;
1884+
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e); break;
1885+
case "surrogatepass": e = new PythonSurrogatePassEncoding(e); break;
1886+
default: e = new PythonErrorHandlerEncoding(context, e, errors); break;
18841887
}
18851888

18861889
byte[]? preamble = includePreamble ? e.GetPreamble() : null;
@@ -1893,7 +1896,7 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) {
18931896
}
18941897
e.GetBytes(s, 0, s.Length, bytes, preambleLen);
18951898
} catch (EncoderFallbackException ex) {
1896-
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding;
1899+
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = GetEncodingName(e, normalize: false, defaultName: encoding);
18971900
if (!ex.Data.Contains("object")) ex.Data["object"] = s;
18981901
throw;
18991902
}
@@ -1951,7 +1954,7 @@ static CodecsInfo() {
19511954
d["iso_8859_1"] = d["iso8859_1"] = d["8859"] = d["iso8859"]
19521955
= d["cp28591"] = d["28591"] = d["cp819"] = d["819"]
19531956
= d["latin_1"] = d["latin1"] = d["latin"] = d["l1"] = makeEncodingProxy(() => Latin1Encoding);
1954-
d["cp20127"] = d["us_ascii"] = d["us"] = d["ascii"] = d["646"] = makeEncodingProxy(() => PythonAsciiEncoding.Instance);
1957+
d["cp20127"] = d["us_ascii"] = d["us"] = d["ascii"] = d["646"] = makeEncodingProxy(() => Encoding.ASCII);
19551958
d["cp65000"] = d["utf_7"] = d["u7"] = d["unicode_1_1_utf_7"] = makeEncodingProxy(() => new UTF7Encoding(allowOptionals: true));
19561959
d["cp65001"] = d["utf_8"] = d["utf8"] = d["u8"] = makeEncodingProxy(() => new UTF8Encoding(encoderShouldEmitUTF8Identifier: false));
19571960
d["utf_8_sig"] = makeEncodingProxy(() => new UTF8Encoding(encoderShouldEmitUTF8Identifier: true));

0 commit comments

Comments
 (0)