66from more_itertools import flatten
77import torch
88from torch .utils .data import Dataset
9- from datasets import Dataset as HGDataset , DatasetDict as HGDatasetDict
9+ from datasets import Dataset as HFDataset , DatasetDict as HFDatasetDict
1010from datasets import Sequence , ClassLabel
1111from transformers import (
1212 AutoModelForTokenClassification ,
@@ -235,14 +235,14 @@ def load_conll2002_bio(
235235 return sents , list (flatten (sents )), entities
236236
237237
238- def hgdataset_from_conll2002 (
238+ def hfdataset_from_conll2002 (
239239 path : str ,
240240 tag_conversion_map : Optional [Dict [str , str ]] = None ,
241241 separator : str = "\t " ,
242242 max_sent_len : Optional [int ] = None ,
243243 labels : Optional [list [str ]] = None ,
244244 ** kwargs ,
245- ) -> HGDataset :
245+ ) -> HFDataset :
246246 """Load a CoNLL-2002 file as a Huggingface Dataset.
247247
248248 :param path: passed to :func:`.load_conll2002_bio`
@@ -277,13 +277,21 @@ def hgdataset_from_conll2002(
277277 for sent_start , sent_end in zip (sent_starts , sent_ends )
278278 ]
279279
280- dataset = HGDataset .from_dict ({"tokens" : sentences , "labels" : sent_tags })
280+ dataset = HFDataset .from_dict ({"tokens" : sentences , "labels" : sent_tags })
281281 if labels is None :
282282 labels = sorted (set (tags ))
283283 dataset = dataset .cast_column ("labels" , Sequence (ClassLabel (names = labels )))
284284 return dataset
285285
286286
def hgdataset_from_conll2002(*args, **kwargs) -> HFDataset:
    """Deprecated alias kept for backward compatibility.

    Call :func:`.hfdataset_from_conll2002` instead.

    All arguments are forwarded unchanged.  Positional arguments are
    accepted too, so legacy callers that passed ``path`` positionally
    (as the old signature allowed) keep working.
    """
    # Local import keeps the shim self-contained.
    import warnings

    # Surface the rename to callers instead of silently aliasing;
    # stacklevel=2 points the warning at the caller's line.
    warnings.warn(
        "hgdataset_from_conll2002 is deprecated; use "
        "hfdataset_from_conll2002 instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return hfdataset_from_conll2002(*args, **kwargs)
293+
294+
287295def _tokenize_and_align_labels (
288296 examples , tokenizer : PreTrainedTokenizerFast , label_all_tokens : bool = True
289297):
@@ -324,15 +332,15 @@ def _tokenize_and_align_labels(
324332
325333
326334def train_ner_model (
327- hg_id : str ,
328- dataset : Union [HGDataset , HGDatasetDict ],
335+ hf_id : str ,
336+ dataset : Union [HFDataset , HFDatasetDict ],
329337 targs : TrainingArguments ,
330338 train_split : str = "train" ,
331339 valid_split : str = "valid" ,
332340) -> PreTrainedModel :
333341 """Train a NER model on the given dataset.
334342
335- :param hg_id : huggingface ID of the model to train
343+ :param hf_id : huggingface ID of the model to train
336344 :param dataset: huggingface dataset on which to train. The
337345 'labels' column is assumed to contain NER labels.
338346 :param TrainingArguments: training arguments for the huggingface
@@ -345,14 +353,14 @@ def train_ner_model(
345353 # BERT tokenizer splits tokens into subtokens. The
346354 # tokenize_and_align_labels function correctly aligns labels and
347355 # subtokens.
348- tokenizer = AutoTokenizer .from_pretrained (hg_id )
356+ tokenizer = AutoTokenizer .from_pretrained (hf_id )
349357 dataset = dataset .map (
350358 ft .partial (_tokenize_and_align_labels , tokenizer = tokenizer ), batched = True
351359 )
352360
353361 label_lst = dataset [train_split ].features ["labels" ].feature .names
354362 model = AutoModelForTokenClassification .from_pretrained (
355- hg_id ,
363+ hf_id ,
356364 num_labels = len (label_lst ),
357365 id2label = {i : label for i , label in enumerate (label_lst )},
358366 label2id = {label : i for i , label in enumerate (label_lst )},