Skip to content

Commit 4b14fc5

Browse files
pjkundert authored and matejcik committed
Support unambiguous detection of language if only prefixes are supplied
- Ceases search as soon as ambiguity is resolved
- Simplify success exit criteria for detecting language
- Correct language deduction if prefixes remain ambiguous
1 parent 264145f commit 4b14fc5

2 files changed

Lines changed: 46 additions & 3 deletions

File tree

src/mnemonic/mnemonic.py

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -93,15 +93,36 @@ def normalize_string(txt: t.AnyStr) -> str:
9393

9494
@classmethod
9595
def detect_language(cls, code: str) -> str:
96-
"""Scan the Mnemonic until the language becomes unambiguous."""
96+
"""Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes.
97+
98+
Unfortunately, there are valid words that are ambiguous between languages, which are complete words
99+
in one language and are prefixes in another:
100+
101+
english: abandon ... about
102+
french: abandon ... aboutir
103+
104+
If prefixes remain ambiguous, require exactly one language where word(s) match exactly.
105+
"""
97106
code = cls.normalize_string(code)
98107
possible = set(cls(lang) for lang in cls.list_languages())
99-
for word in code.split():
100-
possible = set(p for p in possible if word in p.wordlist)
108+
words = set(code.split())
109+
for word in words:
110+
# possible languages have candidate(s) starting with the word/prefix
111+
possible = set(
112+
p for p in possible if any(c.startswith(word) for c in p.wordlist)
113+
)
101114
if not possible:
102115
raise ConfigurationError(f"Language unrecognized for {word!r}")
103116
if len(possible) == 1:
104117
return possible.pop().language
118+
# Multiple languages match: A prefix in many, but an exact match in one determines language.
119+
complete = set()
120+
for word in words:
121+
exact = set(p for p in possible if word in p.wordlist)
122+
if len(exact) == 1:
123+
complete.update(exact)
124+
if len(complete) == 1:
125+
return complete.pop().language
105126
raise ConfigurationError(
106127
f"Language ambiguous between {', '.join(p.language for p in possible)}"
107128
)

tests/test_mnemonic.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,16 @@ def test_failed_checksum(self) -> None:
5757
def test_detection(self) -> None:
5858
self.assertEqual("english", Mnemonic.detect_language("security"))
5959

60+
self.assertEqual(
61+
"english", Mnemonic.detect_language("fruit wave dwarf")
62+
) # ambiguous up to wave
63+
self.assertEqual(
64+
"english", Mnemonic.detect_language("fru wago dw")
65+
) # ambiguous french/english up to dwarf prefix
66+
self.assertEqual(
67+
"french", Mnemonic.detect_language("fru wago dur enje")
68+
) # ambiguous french/english up to enjeu prefix
69+
6070
with self.assertRaises(Exception):
6171
Mnemonic.detect_language(
6272
"jaguar xxxxxxx"
@@ -67,8 +77,20 @@ def test_detection(self) -> None:
6777
"jaguar jaguar"
6878
) # Ambiguous after examining all words
6979

80+
# Allowing word prefixes in language detection presents ambiguity issues. Require exactly
81+
# one language that matches all prefixes, or one language matching some word(s) exactly.
7082
self.assertEqual("english", Mnemonic.detect_language("jaguar security"))
7183
self.assertEqual("french", Mnemonic.detect_language("jaguar aboyer"))
84+
self.assertEqual("english", Mnemonic.detect_language("abandon about"))
85+
self.assertEqual("french", Mnemonic.detect_language("abandon aboutir"))
86+
self.assertEqual("french", Mnemonic.detect_language("fav financer"))
87+
self.assertEqual("czech", Mnemonic.detect_language("fav finance"))
88+
with self.assertRaises(Exception):
89+
Mnemonic.detect_language("favor finan")
90+
self.assertEqual("czech", Mnemonic.detect_language("flanel"))
91+
self.assertEqual("portuguese", Mnemonic.detect_language("flanela"))
92+
with self.assertRaises(Exception):
93+
Mnemonic.detect_language("flane")
7294

7395
def test_utf8_nfkd(self) -> None:
7496
# The same sentence in various UTF-8 forms

0 commit comments

Comments
 (0)