File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -213,8 +213,10 @@ def load_conll2002_bio(
213213 tags = []
214214 for line in raw_data .split ("\n " ):
215215 line = line .strip ("\n " )
216- if re .fullmatch (r"\s*" , line ) or (
217- not max_sent_len is None and len (sent_tokens ) >= max_sent_len
216+ if (
217+ re .fullmatch (r"\s*" , line ) # ignore empty lines
218+ or re .fullmatch (r"# [^:]+: .*" , line ) # ignore Novelties style metadata
219+ or (not max_sent_len is None and len (sent_tokens ) >= max_sent_len )
218220 ):
219221 if len (sent_tokens ) == 0 :
220222 continue
@@ -224,6 +226,8 @@ def load_conll2002_bio(
224226 token , tag = line .split (separator )
225227 sent_tokens .append (token )
226228 tags .append (tag_conversion_map .get (tag , tag ))
229+ if len (sent_tokens ) != 0 :
230+ sents .append (sent_tokens )
227231
228232 tokens = list (flatten (sents ))
229233 entities = ner_entities (tokens , tags )
You can’t perform that action at this time.
0 commit comments