Skip to content

Commit dfd8fe4

Browse files
committed
make sure load_conll2002_bio is compatible with Novelties
1 parent 01daaac commit dfd8fe4

1 file changed

Lines changed: 6 additions & 2 deletions

File tree

renard/ner_utils.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -213,8 +213,10 @@ def load_conll2002_bio(
213213
tags = []
214214
for line in raw_data.split("\n"):
215215
line = line.strip("\n")
216-
if re.fullmatch(r"\s*", line) or (
217-
not max_sent_len is None and len(sent_tokens) >= max_sent_len
216+
if (
217+
re.fullmatch(r"\s*", line) # ignore empty lines
218+
or re.fullmatch(r"# [^:]+: .*", line) # ignore Novelties style metadata
219+
or (not max_sent_len is None and len(sent_tokens) >= max_sent_len)
218220
):
219221
if len(sent_tokens) == 0:
220222
continue
@@ -224,6 +226,8 @@ def load_conll2002_bio(
224226
token, tag = line.split(separator)
225227
sent_tokens.append(token)
226228
tags.append(tag_conversion_map.get(tag, tag))
229+
if len(sent_tokens) != 0:
230+
sents.append(sent_tokens)
227231

228232
tokens = list(flatten(sents))
229233
entities = ner_entities(tokens, tags)

0 commit comments

Comments
 (0)