Skip to content

Commit fe61e85

Browse files
authored
New 'big5bbs' encoding! (jquast#136)
This encoding is sometimes called "BBS 半形字" (half-width characters); the PCMan/PttBBS terminal clients handle it with a custom font that maps lone high bytes (Big5 lead bytes that are not followed by a valid second byte) to specific glyphs. It is very popular within Taiwanese BBS culture but has no formal name or IANA-registered codec. It is essentially Big5 plus a proprietary single-byte overlay for art characters.
1 parent 4bfd0ff commit fe61e85

2 files changed

Lines changed: 339 additions & 0 deletions

File tree

telnetlib3/encodings/big5bbs.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
r"""
2+
Big5-BBS hybrid codec for Taiwanese BBS systems.
3+
4+
Traditional Taiwanese BBS systems (PttBBS, DreamBBS, etc.) send a byte stream
5+
that mixes Big5-encoded Chinese text with single-byte half-width (半形) art
6+
characters whose byte values (0xA1-0xFE) overlap with Big5 lead bytes. These
7+
lone high bytes appear immediately before ANSI escape sequences (``\x1b[...``)
8+
and cannot form valid Big5 pairs since ESC (0x1B) is not a valid Big5 second
9+
byte (which must be 0x40-0x7E or 0xA1-0xFE).
10+
11+
Decoding algorithm:
12+
13+
- When a Big5 lead byte (0xA1-0xFE) is followed by a valid Big5 second byte
14+
(0x40-0x7E or 0xA1-0xFE), the pair is decoded as Big5.
15+
- When a lead byte is followed by any other byte (e.g. ESC), the lone lead
16+
byte is decoded via CP437 and the following byte is re-processed.
17+
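- A lead byte that is the last byte of the stream (``final=True``) is also decoded via CP437; with ``final=False`` it is held until more data arrives.
- Bytes outside the lead-byte range (below 0xA1, plus 0xFF) are decoded via latin-1 (identical to ASCII for 0x00-0x7F).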
18+
"""
19+
20+
# std imports
21+
import codecs
22+
from typing import Tuple, Union
23+
24+
25+
class Codec(codecs.Codec):
    """
    Big5-BBS stateless codec (decodes entire buffer at once with final=True).

    Encoding prefers Big5 and falls back to CP437 for characters Big5 cannot represent.
    The fallback is best-effort: ``decode`` only maps lone bytes in 0xA1-0xFE back
    through CP437, so fallback bytes outside that range do not round-trip.
    """

    def encode(self, input: str, errors: str = "strict") -> Tuple[bytes, int]:
        """
        Encode string to bytes, preferring Big5 with CP437 fallback per character.

        :param input: Text to encode.
        :param errors: Error handler applied to characters that are in neither
            Big5 nor CP437.
        :returns: Tuple of (encoded bytes, number of characters consumed).
        :raises UnicodeEncodeError: With ``errors="strict"``, when a character is
            in neither Big5 nor CP437.
        """
        try:
            # Fast path: Big5 is stateless, so encoding the whole string at once
            # yields the same bytes as joining per-character encodes.
            return input.encode("big5"), len(input)
        except UnicodeEncodeError:
            pass  # at least one character needs the CP437 fallback

        result = []
        for char in input:
            try:
                result.append(char.encode("big5"))
            except UnicodeEncodeError:
                encoded, _ = codecs.charmap_encode(char, errors, _CP437_ENCODING_TABLE)
                result.append(encoded)
        return b"".join(result), len(input)

    def decode(self, input: bytes, errors: str = "strict") -> Tuple[str, int]:
        """
        Decode bytes using Big5/CP437 hybrid algorithm.

        :param input: Complete byte buffer to decode.
        :param errors: Error handler passed to the incremental decoder.
        :returns: Tuple of (decoded text, number of bytes consumed).
        """
        # A fresh decoder with final=True flushes any trailing lone lead byte.
        dec = IncrementalDecoder(errors)
        return dec.decode(input, final=True), len(input)
43+
44+
45+
class IncrementalEncoder(codecs.IncrementalEncoder):
    """
    Big5-BBS incremental encoder; Big5 primary, CP437 fallback.

    Every character is encoded independently, so no state is carried between calls.
    """

    def encode(self, input: str, final: bool = False) -> bytes:
        """
        Encode input string incrementally.

        :param input: Text chunk to encode.
        :param final: Unused; the encoder never buffers input.
        :returns: Encoded bytes for this chunk.
        :raises UnicodeEncodeError: With ``errors="strict"``, when a character is
            in neither Big5 nor CP437.
        """
        try:
            # Fast path: Big5 is stateless, so encoding the whole chunk at once
            # yields the same bytes as joining per-character encodes.
            return input.encode("big5")
        except UnicodeEncodeError:
            pass  # at least one character needs the CP437 fallback

        result = []
        for char in input:
            try:
                result.append(char.encode("big5"))
            except UnicodeEncodeError:
                encoded, _ = codecs.charmap_encode(char, self.errors, _CP437_ENCODING_TABLE)
                result.append(encoded)
        return b"".join(result)

    def reset(self) -> None:
        """Reset encoder state (stateless; no-op)."""

    def getstate(self) -> int:
        """Return encoder state, always ``0`` because nothing is buffered."""
        return 0

    def setstate(self, state: Union[int, str]) -> None:
        """Restore encoder state (stateless; *state* is ignored)."""
68+
69+
70+
class IncrementalDecoder(codecs.IncrementalDecoder):
    """
    Big5-BBS incremental decoder with one-byte lookahead.

    Holds at most one pending Big5 lead byte between calls.
    """

    # CP437 character for every byte value, indexed by the byte; built once so the
    # decode loop avoids a codec lookup per lone lead byte.
    _CP437_CHARS = bytes(range(256)).decode("cp437")

    def __init__(self, errors: str = "strict") -> None:
        """
        Initialize decoder with empty pending buffer.

        :param errors: Error handler used when decoding Big5 byte pairs.
        """
        super().__init__(errors)
        self._buf: bytes = b""

    def decode(self, input: bytes, final: bool = False) -> str:  # type: ignore[override]
        """
        Decode input bytes using Big5/CP437 hybrid algorithm.

        :param input: Next chunk of the byte stream.
        :param final: When true, a trailing lone lead byte is decoded via CP437
            instead of being held for the next call.
        :returns: Text decoded from the pending byte (if any) plus *input*.
        :raises UnicodeDecodeError: With ``errors="strict"``, when a byte pair in
            the Big5 range is rejected by the ``big5`` codec.
        """
        data = self._buf + input
        self._buf = b""
        cp437 = self._CP437_CHARS
        size = len(data)
        result = []
        i = 0
        while i < size:
            b = data[i]
            if not 0xA1 <= b <= 0xFE:
                # Not a lead byte: latin-1 maps each byte to the same code point.
                result.append(chr(b))
                i += 1
                continue
            if i + 1 >= size:
                # Lead byte with no lookahead available yet.
                if not final:
                    self._buf = bytes([b])
                    break
                result.append(cp437[b])
                i += 1
                continue
            b2 = data[i + 1]
            if 0x40 <= b2 <= 0x7E or 0xA1 <= b2 <= 0xFE:
                result.append(data[i : i + 2].decode("big5", errors=self.errors))
                i += 2
            else:
                # Lone art byte; the following byte is re-processed on the next pass.
                result.append(cp437[b])
                i += 1
        return "".join(result)

    def reset(self) -> None:
        """Reset decoder state, discarding any pending lead byte."""
        self._buf = b""

    def getstate(self) -> Tuple[bytes, int]:
        """Return decoder state as (buffer, flags) tuple; flags is always ``0``."""
        return (self._buf, 0)

    def setstate(self, state: Tuple[bytes, int]) -> None:
        """Restore decoder state from (buffer, flags) tuple; only the buffer is used."""
        self._buf = state[0]
121+
122+
123+
class StreamWriter(Codec, codecs.StreamWriter):
    """Big5-BBS stream writer; ``Codec`` is listed first so its ``encode`` is used."""
125+
126+
127+
class StreamReader(Codec, codecs.StreamReader):
    """Big5-BBS stream reader; ``Codec`` is listed first so its ``decode`` is used."""
129+
130+
131+
def getregentry() -> codecs.CodecInfo:
    """
    Return the codec registry entry.

    :returns: ``CodecInfo`` named ``"big5bbs"`` wiring up the stateless, incremental
        and stream classes of this module.
    """
    # Codec keeps no instance state, so one instance can back both entry points.
    codec = Codec()
    return codecs.CodecInfo(
        name="big5bbs",
        encode=codec.encode,
        decode=codec.decode,  # type: ignore[arg-type]
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
142+
143+
144+
def getaliases() -> Tuple[str, ...]:
    """
    Return codec aliases (normalized: hyphens replaced with underscores).

    :returns: Tuple of alternative lookup names for the ``big5bbs`` codec.
    """
    return ("big5_bbs",)
147+
148+
149+
def _build_cp437_encoding_table() -> dict[int, int]:
150+
"""Build a Unicode ordinal → byte value map for CP437."""
151+
table: dict[int, int] = {}
152+
for byte_val in range(256):
153+
char = bytes([byte_val]).decode("cp437")
154+
table[ord(char)] = byte_val
155+
return table
156+
157+
158+
# Unicode ordinal → CP437 byte map consulted by the encoders' fallback path; built once at import.
_CP437_ENCODING_TABLE = _build_cp437_encoding_table()
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
"""Tests for the Big5-BBS hybrid codec."""
2+
3+
# std imports
4+
import codecs
5+
6+
# 3rd party
7+
import pytest
8+
9+
# local
10+
import telnetlib3 # noqa: F401
11+
12+
13+
def test_codec_lookup():
    """The codec is discoverable under its canonical name."""
    info = codecs.lookup("big5bbs")
    assert info.name == "big5bbs"
16+
17+
18+
def test_codec_alias_hyphen():
    """The hyphenated spelling resolves to the same codec."""
    # Look up the canonical name first, as the original test does, before the alias.
    codecs.lookup("big5bbs")
    info = codecs.lookup("big5-bbs")
    assert info.name == "big5bbs"
22+
23+
24+
def test_codec_alias_underscore():
    """The underscore spelling resolves to the same codec."""
    # Look up the canonical name first, as the original test does, before the alias.
    codecs.lookup("big5bbs")
    info = codecs.lookup("big5_bbs")
    assert info.name == "big5bbs"
28+
29+
30+
def test_ascii_passthrough():
    """Plain ASCII bytes decode unchanged."""
    data = b"Hello, World!\n"
    assert data.decode("big5bbs") == "Hello, World!\n"
33+
34+
35+
def test_valid_big5_pair():
    """A valid Big5 byte pair decodes to its CJK character."""
    # Encode a known CJK character with the stock Big5 codec, then decode it with ours.
    char = "夢"
    big5_bytes = char.encode("big5")
    assert big5_bytes.decode("big5bbs") == char
40+
41+
42+
def test_valid_big5_pair_in_context():
    """A Big5 pair surrounded by ASCII text decodes in place."""
    # Inline Big5 pair between ASCII text
    char = "夢"
    big5_bytes = char.encode("big5")
    data = b"test" + big5_bytes + b"end"
    assert data.decode("big5bbs") == "test" + char + "end"
48+
49+
50+
def test_lone_lead_0xa1_before_esc():
    """A lone 0xA1 before an ANSI escape sequence falls back to CP437."""
    # 0xA1 is a Big5 lead byte; ESC is not a valid second byte → CP437 fallback
    # CP437 0xA1 = í (LATIN SMALL LETTER I WITH ACUTE)
    data = bytes([0xA1, 0x1B, 0x5B, 0x33, 0x32, 0x6D])  # 0xA1 ESC[32m
    result = data.decode("big5bbs")
    assert result[0] == "\u00ed"  # í
    assert result[1:] == "\x1b[32m"
57+
58+
59+
def test_lone_lead_0xb0_before_esc():
    """A lone 0xB0 before ESC falls back to CP437."""
    # CP437 0xB0 = ░ (LIGHT SHADE, U+2591)
    data = bytes([0xB0, 0x1B])
    result = data.decode("big5bbs")
    assert result == "\u2591\x1b"
64+
65+
66+
def test_lone_lead_0xb6_before_esc():
    """A lone 0xB6 before ESC falls back to CP437."""
    # CP437 0xB6 = ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE, U+2562)
    data = bytes([0xB6, 0x1B])
    result = data.decode("big5bbs")
    assert result == "\u2562\x1b"
71+
72+
73+
@pytest.mark.parametrize(
    "byte_val,expected_unicode",
    [
        (0xA1, "\u00ed"),  # í
        (0xA2, "\u00f3"),  # ó
        (0xA8, "\u00bf"),  # ¿
        (0xA9, "\u2310"),  # ⌐
        (0xAA, "\u00ac"),  # ¬
        (0xAB, "\u00bd"),  # ½
        (0xB0, "\u2591"),  # ░
        (0xB6, "\u2562"),  # ╢
        (0xBF, "\u2510"),  # ┐
        (0xC3, "\u251c"),  # ├
        (0xC6, "\u255e"),  # ╞
        (0xC7, "\u255f"),  # ╟
        (0xCA, "\u2569"),  # ╩
        (0xD1, "\u2564"),  # ╤
        (0xEE, "\u03b5"),  # ε
        (0xEF, "\u2229"),  # ∩
    ],
)
def test_lone_lead_bytes_cp437_fallback(byte_val, expected_unicode):
    """Each sampled lone lead byte decodes to its CP437 glyph and leaves ESC intact."""
    # Each lone lead byte followed by ESC (invalid second byte) falls back to CP437
    data = bytes([byte_val, 0x1B])
    result = data.decode("big5bbs")
    assert result[0] == expected_unicode
    assert result[1] == "\x1b"
100+
101+
102+
def test_split_across_chunks_big5_pair():
    """A Big5 pair split over two decode calls is reassembled."""
    # Lead byte in chunk 1, second byte in chunk 2 → valid Big5 pair
    char = "夢"
    big5_bytes = char.encode("big5")
    assert len(big5_bytes) == 2
    decoder = codecs.getincrementaldecoder("big5bbs")()
    result1 = decoder.decode(big5_bytes[:1], final=False)
    assert result1 == ""  # buffered
    result2 = decoder.decode(big5_bytes[1:], final=False)
    assert result2 == char
112+
113+
114+
def test_split_lead_byte_final_true():
    """A trailing lead byte is flushed through CP437 when the stream ends."""
    # Lead byte at end of stream with final=True → CP437 fallback
    decoder = codecs.getincrementaldecoder("big5bbs")()
    result = decoder.decode(bytes([0xB0]), final=True)
    assert result == "\u2591"  # ░
119+
120+
121+
def test_split_lead_byte_not_final():
    """A trailing lead byte is held back while more input may follow."""
    # Lead byte with final=False → buffered, returns empty string
    decoder = codecs.getincrementaldecoder("big5bbs")()
    result = decoder.decode(bytes([0xB0]), final=False)
    assert result == ""
126+
127+
128+
def test_round_trip_big5_text():
    """Pure Big5 text survives an encode/decode round trip."""
    text = "夢想台灣"
    encoded = text.encode("big5bbs")
    decoded = encoded.decode("big5bbs")
    assert decoded == text
133+
134+
135+
def test_getstate_setstate_preserves_pending_byte():
    """A pending lead byte can be moved to another decoder via getstate/setstate."""
    decoder = codecs.getincrementaldecoder("big5bbs")()
    char = "夢"
    big5_bytes = char.encode("big5")
    decoder.decode(big5_bytes[:1], final=False)
    state = decoder.getstate()
    assert state[0] == big5_bytes[:1]

    # A second decoder restored from that state completes the pair.
    decoder2 = codecs.getincrementaldecoder("big5bbs")()
    decoder2.setstate(state)
    result = decoder2.decode(big5_bytes[1:], final=True)
    assert result == char
147+
148+
149+
def test_reset_clears_pending_byte():
    """reset() discards a buffered lead byte."""
    decoder = codecs.getincrementaldecoder("big5bbs")()
    char = "夢"
    big5_bytes = char.encode("big5")
    decoder.decode(big5_bytes[:1], final=False)
    decoder.reset()
    state = decoder.getstate()
    assert state[0] == b""
157+
158+
159+
def test_mixed_stream():
    """Big5 text, a lone art byte, an ANSI escape and ASCII decode together."""
    # Simulate a BBS art stream: Chinese text + lone art bytes + ANSI escape
    char = "夢"
    big5_bytes = char.encode("big5")
    data = big5_bytes + bytes([0xB0, 0x1B, 0x5B, 0x33, 0x32, 0x6D]) + b"text"
    result = data.decode("big5bbs")
    assert result == char + "\u2591\x1b[32mtext"
166+
167+
168+
def test_incremental_encoder_ascii():
    """The incremental encoder passes ASCII through unchanged."""
    encoder = codecs.getincrementalencoder("big5bbs")()
    assert encoder.encode("Hello") == b"Hello"
171+
172+
173+
def test_incremental_encoder_big5():
    """The incremental encoder emits the stock Big5 bytes for a CJK character."""
    encoder = codecs.getincrementalencoder("big5bbs")()
    char = "夢"
    assert encoder.encode(char) == char.encode("big5")
177+
178+
179+
def test_incremental_encoder_getstate():
    """The stateless incremental encoder always reports state 0."""
    encoder = codecs.getincrementalencoder("big5bbs")()
    assert encoder.getstate() == 0

0 commit comments

Comments
 (0)