New 'big5bbs' encoding! (jquast#136)

jquast · web-flow · commit fe61e8515e99 · 2026-03-05T05:58:48.000-05:00
This encoding is sometimes called "BBS 半形字" (half-width characters).
The PCMan/PttBBS terminal clients handle it with a custom font that maps
lone high bytes (Big5 lead bytes with no valid second byte) to specific
glyphs.  It's very popular within Taiwanese BBS culture but has no formal
name or IANA-registered codec.

It's essentially Big5 + a proprietary single-byte overlay for art
characters.
diff --git a/telnetlib3/encodings/big5bbs.py b/telnetlib3/encodings/big5bbs.py
@@ -0,0 +1,158 @@
+r"""
+Big5-BBS hybrid codec for Taiwanese BBS systems.
+
+Traditional Taiwanese BBS systems (PttBBS, DreamBBS, etc.) send a byte stream
+that mixes Big5-encoded Chinese text with single-byte half-width (半形) art
+characters whose byte values (0xA1-0xFE) overlap with Big5 lead bytes.  These
+lone high bytes appear immediately before ANSI escape sequences (``\x1b[...``)
+and cannot form valid Big5 pairs since ESC (0x1B) is not a valid Big5 second
+byte (which must be 0x40-0x7E or 0xA1-0xFE).
+
+Decoding algorithm:
+
+- When a Big5 lead byte (0xA1-0xFE) is followed by a valid Big5 second byte
+  (0x40-0x7E or 0xA1-0xFE), the pair is decoded as Big5.
+- When a lead byte is followed by any other byte (e.g. ESC), the lone lead
+  byte is decoded via CP437 and the following byte is re-processed.
+- All other bytes (0x00-0xA0 and 0xFF) are decoded via latin-1 (ASCII for 0x00-0x7F).
+"""
+
+# std imports
+import codecs
+from typing import Tuple, Union
+
+
+class Codec(codecs.Codec):
+    """Stateless Big5-BBS codec; ``decode`` treats its input as a complete stream."""
+
+    def encode(self, input: str, errors: str = "strict") -> Tuple[bytes, int]:
+        """Encode each character as Big5, or as a CP437 byte when Big5 cannot."""
+        chunks = []
+        for char in input:
+            try:
+                chunk = char.encode("big5")
+            except UnicodeEncodeError:
+                chunk = codecs.charmap_encode(char, errors, _CP437_ENCODING_TABLE)[0]
+            chunks.append(chunk)
+        return b"".join(chunks), len(input)
+
+    def decode(self, input: bytes, errors: str = "strict") -> Tuple[str, int]:
+        """Decode a whole buffer at once; a trailing lone lead byte becomes CP437."""
+        decoder = IncrementalDecoder(errors)
+        return decoder.decode(input, final=True), len(input)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    """Incremental Big5-BBS encoder: Big5 first, CP437 byte as the fallback."""
+
+    def encode(self, input: str, final: bool = False) -> bytes:
+        """
+        Encode one chunk of text.
+
+        Every character is encoded on its own, so nothing is carried over to
+        the next call and ``final`` has no effect.  The per-character work is
+        delegated to the one-shot :class:`Codec`.
+        """
+        encoded, _ = Codec().encode(input, self.errors)
+        return encoded
+
+    def reset(self) -> None:
+        """Reset encoder state (nothing is buffered, so this is a no-op)."""
+
+    def getstate(self) -> int:
+        """Return the encoder state, which is always ``0``."""
+        return 0
+
+    def setstate(self, state: Union[int, str]) -> None:
+        """Accept a saved state and ignore it; there is nothing to restore."""
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+    """
+    Incremental Big5-BBS decoder that looks one byte ahead.
+
+    The only state kept between calls is a single unresolved Big5 lead byte.
+    """
+
+    def __init__(self, errors: str = "strict") -> None:
+        """Create a decoder with no byte pending."""
+        super().__init__(errors)
+        self._buf: bytes = b""
+
+    def decode(self, input: bytes, final: bool = False) -> str:  # type: ignore[override]
+        """Decode a chunk; a lead byte that ends it is held back unless ``final``."""
+        pending, self._buf = self._buf + input, b""
+        size = len(pending)
+        pieces = []
+        pos = 0
+        while pos < size:
+            lead = pending[pos]
+            if not 0xA1 <= lead <= 0xFE:
+                # Not a Big5 lead byte: one byte maps to one latin-1 character.
+                pieces.append(bytes([lead]).decode("latin-1"))
+                pos += 1
+                continue
+            if pos + 1 == size and not final:
+                # The next chunk may supply the second byte of a Big5 pair.
+                self._buf = bytes([lead])
+                break
+            # -1 stands for "no following byte" and fails both range checks.
+            trail = pending[pos + 1] if pos + 1 < size else -1
+            if 0x40 <= trail <= 0x7E or 0xA1 <= trail <= 0xFE:
+                pieces.append(bytes([lead, trail]).decode("big5", errors=self.errors))
+                pos += 2
+            else:
+                # Lone lead byte: a half-width art character, mapped via CP437.
+                pieces.append(bytes([lead]).decode("cp437"))
+                pos += 1
+        return "".join(pieces)
+
+    def reset(self) -> None:
+        """Discard any pending lead byte."""
+        self._buf = b""
+
+    def getstate(self) -> Tuple[bytes, int]:
+        """Return ``(pending_bytes, 0)``; the second item is always zero."""
+        return (self._buf, 0)
+
+    def setstate(self, state: Tuple[bytes, int]) -> None:
+        """Restore the pending bytes from the first item of *state*."""
+        self._buf = state[0]
+
+
+class StreamWriter(Codec, codecs.StreamWriter):
+    """Big5-BBS stream writer; ``encode`` is inherited from :class:`Codec`."""
+
+
+class StreamReader(Codec, codecs.StreamReader):
+    """
+    Big5-BBS stream reader.
+
+    ``codecs.StreamReader`` decodes the stream chunk by chunk, so a chunk may
+    end in the middle of a Big5 pair.  ``decode`` therefore reports a trailing
+    lead byte as not consumed; the reader keeps it and presents it again in
+    front of the next chunk, instead of decoding it as a lone CP437 character
+    and then mis-decoding the second byte on its own.
+
+    As with truncated input in other multi-byte stream readers, a lone lead
+    byte that is the very last byte of the stream stays buffered and is not
+    returned.
+    """
+
+    def decode(self, input: bytes, errors: str = "strict") -> Tuple[str, int]:
+        """Decode one chunk and return ``(text, number_of_bytes_consumed)``."""
+        decoder = IncrementalDecoder(errors)
+        # final=False keeps a trailing lead byte pending rather than guessing.
+        text = decoder.decode(input, final=False)
+        held_back, _ = decoder.getstate()
+        return text, len(input) - len(held_back)
+
+
+def getregentry() -> codecs.CodecInfo:
+    """Build the registry entry that exposes this module as the ``big5bbs`` codec."""
+    return codecs.CodecInfo(
+        Codec().encode,
+        Codec().decode,  # type: ignore[arg-type]
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        name="big5bbs",
+    )
+
+
+def getaliases() -> Tuple[str, ...]:
+    """Return the codec's alias names, spelled in normalized form (underscores, not hyphens)."""
+    return ("big5_bbs",)
+
+
+def _build_cp437_encoding_table() -> dict[int, int]:
+    """Map the Unicode ordinal of every CP437 character to its byte value."""
+    # Decoding all 256 byte values at once yields the characters in byte
+    # order, so each character's index is the byte that encodes it.
+    cp437_chars = bytes(range(256)).decode("cp437")
+    return {ord(char): byte_val for byte_val, char in enumerate(cp437_chars)}
+
+
+# Built once at import time; consulted by the encoders' CP437 fallback.
+_CP437_ENCODING_TABLE = _build_cp437_encoding_table()
diff --git a/telnetlib3/tests/test_big5bbs_codec.py b/telnetlib3/tests/test_big5bbs_codec.py
@@ -0,0 +1,181 @@
+"""Tests for the Big5-BBS hybrid codec."""
+
+# std imports
+import codecs
+
+# 3rd party
+import pytest
+
+# local
+import telnetlib3  # noqa: F401
+
+
+def test_codec_lookup():
+    # The codec must be found under its canonical name.
+    assert codecs.lookup("big5bbs").name == "big5bbs"
+
+
+def test_codec_alias_hyphen():
+    # Look the codec up by its canonical name first, then by the hyphenated alias.
+    codecs.lookup("big5bbs")
+    assert codecs.lookup("big5-bbs").name == "big5bbs"
+
+
+def test_codec_alias_underscore():
+    # Look the codec up by its canonical name first, then by the underscore alias.
+    codecs.lookup("big5bbs")
+    assert codecs.lookup("big5_bbs").name == "big5bbs"
+
+
+def test_ascii_passthrough():
+    # Plain ASCII, including the newline, decodes to the same text.
+    assert b"Hello, World!\n".decode("big5bbs") == "Hello, World!\n"
+
+
+def test_valid_big5_pair():
+    # Bytes produced by the stock Big5 codec must decode to the same character.
+    expected = "夢"
+    raw = expected.encode("big5")
+    assert raw.decode("big5bbs") == expected
+
+
+def test_valid_big5_pair_in_context():
+    # A Big5 pair embedded between runs of ASCII text.
+    expected = "夢"
+    prefix, suffix = "test", "end"
+    raw = prefix.encode("ascii") + expected.encode("big5") + suffix.encode("ascii")
+    assert raw.decode("big5bbs") == prefix + expected + suffix
+
+
+def test_lone_lead_0xa1_before_esc():
+    # ESC cannot be the second byte of a Big5 pair, so the lead byte 0xA1 falls
+    # back to CP437, where it is í (LATIN SMALL LETTER I WITH ACUTE).
+    raw = b"\xa1\x1b[32m"  # 0xA1 followed by ESC[32m
+    text = raw.decode("big5bbs")
+    assert text[0] == "\u00ed"  # í
+    assert text[1:] == "\x1b[32m"
+
+
+def test_lone_lead_0xb0_before_esc():
+    # 0xB0 in CP437 is ░ (LIGHT SHADE, U+2591).
+    raw = b"\xb0\x1b"
+    expected = "\u2591\x1b"
+    assert raw.decode("big5bbs") == expected
+
+
+def test_lone_lead_0xb6_before_esc():
+    # CP437 0xB6 = ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE, U+2562)
+    data = bytes([0xB6, 0x1B])
+    result = data.decode("big5bbs")
+    assert result == "\u2562\x1b"
+
+
+@pytest.mark.parametrize(
+    "byte_val,expected_unicode",
+    [
+        (0xA1, "\u00ed"),  # í
+        (0xA2, "\u00f3"),  # ó
+        (0xA8, "\u00bf"),  # ¿
+        (0xA9, "\u2310"),  # ⌐
+        (0xAA, "\u00ac"),  # ¬
+        (0xAB, "\u00bd"),  # ½
+        (0xB0, "\u2591"),  # ░
+        (0xB6, "\u2562"),  # ╢
+        (0xBF, "\u2510"),  # ┐
+        (0xC3, "\u251c"),  # ├
+        (0xC6, "\u255e"),  # ╞
+        (0xC7, "\u255f"),  # ╟
+        (0xCA, "\u2569"),  # ╩
+        (0xD1, "\u2564"),  # ╤
+        (0xEE, "\u03b5"),  # ε
+        (0xEF, "\u2229"),  # ∩
+    ],
+)
+def test_lone_lead_bytes_cp437_fallback(byte_val, expected_unicode):
+    # ESC is not a valid Big5 second byte, so each lead byte before it must decode via CP437
+    data = bytes([byte_val, 0x1B])
+    result = data.decode("big5bbs")
+    assert result[0] == expected_unicode
+    assert result[1] == "\x1b"
+
+
+def test_split_across_chunks_big5_pair():
+    # The two bytes of one Big5 character arrive in separate decode() calls.
+    expected = "夢"
+    raw = expected.encode("big5")
+    assert len(raw) == 2
+    lead, trail = raw[:1], raw[1:]
+    decoder = codecs.getincrementaldecoder("big5bbs")()
+    # The lead byte alone is buffered; nothing is emitted yet.
+    assert decoder.decode(lead, final=False) == ""
+    assert decoder.decode(trail, final=False) == expected
+
+
+def test_split_lead_byte_final_true():
+    # A lead byte that ends the stream (final=True) is emitted via CP437: 0xB0 → ░
+    make_decoder = codecs.getincrementaldecoder("big5bbs")
+    decoder = make_decoder()
+    assert decoder.decode(b"\xb0", final=True) == "\u2591"
+
+
+def test_split_lead_byte_not_final():
+    # With final=False a trailing lead byte is held back, so no text comes out.
+    make_decoder = codecs.getincrementaldecoder("big5bbs")
+    decoder = make_decoder()
+    assert decoder.decode(b"\xb0", final=False) == ""
+
+
+def test_round_trip_big5_text():
+    # Encoding then decoding Big5-representable text must give the text back.
+    original = "夢想台灣"
+    round_tripped = original.encode("big5bbs").decode("big5bbs")
+    assert round_tripped == original
+
+
+def test_getstate_setstate_preserves_pending_byte():
+    # A lead byte pending in one decoder can be handed over to a fresh decoder.
+    expected = "夢"
+    raw = expected.encode("big5")
+    source = codecs.getincrementaldecoder("big5bbs")()
+    source.decode(raw[:1], final=False)
+    saved = source.getstate()
+    assert saved[0] == raw[:1]
+
+    target = codecs.getincrementaldecoder("big5bbs")()
+    target.setstate(saved)
+    assert target.decode(raw[1:], final=True) == expected
+
+
+def test_reset_clears_pending_byte():
+    # reset() must drop a lead byte that is still waiting for its second byte.
+    lead = "夢".encode("big5")[:1]
+    decoder = codecs.getincrementaldecoder("big5bbs")()
+    decoder.decode(lead, final=False)
+    decoder.reset()
+    pending = decoder.getstate()[0]
+    assert pending == b""
+
+
+def test_mixed_stream():
+    # A BBS art stream in miniature: Chinese text, a lone art byte, an ANSI
+    # colour sequence, then plain ASCII.
+    glyph = "夢"
+    art_and_colour = b"\xb0\x1b[32m"
+    raw = glyph.encode("big5") + art_and_colour + b"text"
+    assert raw.decode("big5bbs") == glyph + "\u2591\x1b[32mtext"
+
+
+def test_incremental_encoder_ascii():
+    # ASCII text passes through the incremental encoder unchanged.
+    assert codecs.getincrementalencoder("big5bbs")().encode("Hello") == b"Hello"
+
+
+def test_incremental_encoder_big5():
+    # A Big5-representable character gets the same bytes as the stock Big5 codec.
+    encoder = codecs.getincrementalencoder("big5bbs")()
+    assert encoder.encode("夢") == "夢".encode("big5")
+
+
+def test_incremental_encoder_getstate():
+    # The incremental encoder reports 0 as its state.
+    assert codecs.getincrementalencoder("big5bbs")().getstate() == 0