Skip to content

Commit 6962b51

Browse files
committed
Move the default encoding to the constants file and use it as the fallback for add(); fixes #67.
Also add the ability to specify the encoding of values added to the constants using add_with_encoding().
1 parent b7aded4 commit 6962b51

3 files changed

Lines changed: 31 additions & 13 deletions

File tree

nameparser/config/__init__.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
from nameparser.config.titles import FIRST_NAME_TITLES
4444
from nameparser.config.regexes import REGEXES
4545

46+
DEFAULT_ENCODING = 'UTF-8'
47+
4648
class SetManager(collections.Set):
4749
'''
4850
Easily add and remove config variables per module or instance. Subclass of
@@ -84,15 +86,23 @@ def __next__(self):
8486
self.count = c + 1
8587
return getattr(self, self.elements[c]) or next(self)
8688

89+
def add_with_encoding(self, s, encoding=None):
    """
    Add the lower case and no-period version of the string to the set. Pass an
    explicit `encoding` parameter to specify the encoding of binary strings that
    are not DEFAULT_ENCODING (UTF-8).

    When `encoding` is not given, binary strings are decoded with
    ``sys.stdin.encoding`` if it is set, otherwise with DEFAULT_ENCODING.
    Strings that are not binary are added without decoding.
    """
    # sys.stdin can be None (e.g. no console attached) or have no encoding,
    # so look the attribute up defensively instead of dereferencing it.
    stdin_encoding = getattr(sys.stdin, 'encoding', None)
    encoding = encoding or stdin_encoding or DEFAULT_ENCODING
    # isinstance() also accepts subclasses of the binary type.
    if isinstance(s, binary_type):
        s = s.decode(encoding)
    self.elements.add(lc(s))
99+
87100
def add(self, *strings):
    """
    Add the lower case and no-period version of the string arguments to the set.

    Each argument is handed to ``add_with_encoding()`` without an explicit
    encoding. Pass several strings as separate positional arguments (unpack
    a list with ``*``). Returns ``self`` for chaining.
    """
    # Plain loop: the calls are made for their side effect only, so there is
    # no reason to build a throwaway list of results.
    for s in strings:
        self.add_with_encoding(s)
    return self
97107

98108
def remove(self, *strings):
@@ -193,7 +203,7 @@ def suffixes_prefixes_titles(self):
193203
if not self._pst:
194204
self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles
195205
return self._pst
196-
206+
197207
def __repr__(self):
198208
return "<Constants() instance>"
199209

nameparser/parser.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from nameparser.util import log
1212
from nameparser.config import CONSTANTS
1313
from nameparser.config import Constants
14+
from nameparser.config import DEFAULT_ENCODING
1415

1516
ENCODING = 'utf-8'
1617

@@ -69,13 +70,13 @@ class HumanName(object):
6970
unparsable = True
7071
_full_name = ''
7172

72-
def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING,
73+
def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
7374
string_format=None):
7475
self.C = constants
7576
if type(self.C) is not type(CONSTANTS):
7677
self.C = Constants()
7778

78-
self.ENCODING = encoding
79+
self.encoding = encoding
7980
self.string_format = string_format or self.C.string_format
8081
# full_name setter triggers the parse
8182
self.full_name = full_name
@@ -127,15 +128,15 @@ def __unicode__(self):
127128
if self.string_format:
128129
# string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
129130
_s = self.string_format.format(**self.as_dict())
130-
# remove trailing punctation from missing nicknames
131+
# remove trailing punctuation from missing nicknames
131132
_s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"")
132133
return self.collapse_whitespace(_s).strip(', ')
133134
return " ".join(self)
134135

135136
def __str__(self):
    """
    Return the formatted name as the interpreter's native ``str`` type.

    On Python 3 this is the text returned by ``__unicode__()``; on older
    interpreters that text is encoded using ``self.encoding``.
    """
    # Compare the numeric major version: comparing the ``sys.version`` string
    # against '3' is lexicographic and misorders multi-digit major versions.
    if sys.version_info[0] >= 3:
        return self.__unicode__()
    return self.__unicode__().encode(self.encoding)
139140

140141
def __repr__(self):
141142
if self.unparsable:
@@ -152,7 +153,7 @@ def __repr__(self):
152153
}
153154
if sys.version >= '3':
154155
return _string
155-
return _string.encode(self.ENCODING)
156+
return _string.encode(self.encoding)
156157

157158
def as_dict(self, include_empty=True):
158159
"""
@@ -355,7 +356,7 @@ def full_name(self, value):
355356
self.original = value
356357
self._full_name = value
357358
if isinstance(value, binary_type):
358-
self._full_name = value.decode(self.ENCODING)
359+
self._full_name = value.decode(self.encoding)
359360
self.parse_full_name()
360361

361362
def collapse_whitespace(self, string):

tests.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ def test_blank_name(self):
189189
self.m(hn.first, "", hn)
190190
self.m(hn.last, "", hn)
191191

192+
192193
class FirstNameHandlingTests(HumanNameTestBase):
193194
def test_first_name(self):
194195
hn = HumanName("Andrew")
@@ -1058,7 +1059,6 @@ def test119(self):
10581059
self.m(hn.last, "Almighty", hn)
10591060

10601061

1061-
10621062
class HumanNameConjunctionTestCase(HumanNameTestBase):
10631063
# Last name with conjunction
10641064
def test_last_name_with_conjunction(self):
@@ -1244,6 +1244,7 @@ def test_conjunction_in_an_address_with_a_first_name_title(self):
12441244
# if you want to be technical, Queen is in FIRST_NAME_TITLES
12451245
self.m(hn.first, "Elizabeth", hn)
12461246

1247+
12471248
class ConstantsCustomization(HumanNameTestBase):
12481249

12491250
def test_add_title(self):
@@ -1335,6 +1336,12 @@ def test_none_empty_attribute_string_formatting(self):
13351336
hn.C.empty_attribute_default = None
13361337
self.assertEqual('', str(hn), hn)
13371338

1339+
def test_add_constant_with_explicit_encoding(self):
    """A Latin-1 byte string added with an explicit encoding is found as text."""
    constants = Constants()
    latin1_title = b'b\351ck'
    constants.titles.add_with_encoding(latin1_title, encoding='latin_1')
    self.assertIn('béck', constants.titles)
1343+
1344+
13381345
class HumanNameNicknameTestCase(HumanNameTestBase):
13391346
# https://code.google.com/p/python-nameparser/issues/detail?id=33
13401347
def test_nickname_in_parenthesis(self):

0 commit comments

Comments
 (0)