
Commit cba71f4

change huggingface prefix from HG to HF
1 parent dfd8fe4 commit cba71f4

2 files changed: 25 additions & 17 deletions


renard/ner_utils.py

Lines changed: 17 additions & 9 deletions
@@ -6,7 +6,7 @@
 from more_itertools import flatten
 import torch
 from torch.utils.data import Dataset
-from datasets import Dataset as HGDataset, DatasetDict as HGDatasetDict
+from datasets import Dataset as HFDataset, DatasetDict as HFDatasetDict
 from datasets import Sequence, ClassLabel
 from transformers import (
     AutoModelForTokenClassification,
@@ -235,14 +235,14 @@ def load_conll2002_bio(
     return sents, list(flatten(sents)), entities


-def hgdataset_from_conll2002(
+def hfdataset_from_conll2002(
     path: str,
     tag_conversion_map: Optional[Dict[str, str]] = None,
     separator: str = "\t",
     max_sent_len: Optional[int] = None,
     labels: Optional[list[str]] = None,
     **kwargs,
-) -> HGDataset:
+) -> HFDataset:
     """Load a CoNLL-2002 file as a Huggingface Dataset.

     :param path: passed to :func:`.load_conll2002_bio`
@@ -277,13 +277,21 @@ def hgdataset_from_conll2002(
         for sent_start, sent_end in zip(sent_starts, sent_ends)
     ]

-    dataset = HGDataset.from_dict({"tokens": sentences, "labels": sent_tags})
+    dataset = HFDataset.from_dict({"tokens": sentences, "labels": sent_tags})
     if labels is None:
         labels = sorted(set(tags))
     dataset = dataset.cast_column("labels", Sequence(ClassLabel(names=labels)))
     return dataset


+def hgdataset_from_conll2002(**kwargs) -> HFDataset:
+    """
+    Deprecated function that only exists for retrocompatibility, you
+    should call :func:`.hfdataset_from_conll2002` instead.
+    """
+    return hfdataset_from_conll2002(**kwargs)
+
+
 def _tokenize_and_align_labels(
     examples, tokenizer: PreTrainedTokenizerFast, label_all_tokens: bool = True
 ):
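
The renamed loader keeps its old entry point through the shim above. A minimal usage sketch, not part of the commit ("train.conll" is a hypothetical CoNLL-2002 BIO file); note that the deprecated alias only forwards **kwargs, so legacy callers must pass arguments by keyword:

from renard.ner_utils import hfdataset_from_conll2002, hgdataset_from_conll2002

# New name: positional arguments work as before.
dataset = hfdataset_from_conll2002("train.conll", separator="\t")

# Deprecated alias: forwards **kwargs only, so use keyword arguments.
legacy = hgdataset_from_conll2002(path="train.conll", separator="\t")

assert dataset.features == legacy.features  # same HF Dataset either way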
@@ -324,15 +332,15 @@ def _tokenize_and_align_labels(


 def train_ner_model(
-    hg_id: str,
-    dataset: Union[HGDataset, HGDatasetDict],
+    hf_id: str,
+    dataset: Union[HFDataset, HFDatasetDict],
     targs: TrainingArguments,
     train_split: str = "train",
     valid_split: str = "valid",
 ) -> PreTrainedModel:
     """Train a NER model on the given dataset.

-    :param hg_id: huggingface ID of the model to train
+    :param hf_id: huggingface ID of the model to train
     :param dataset: huggingface dataset on which to train. The
         'labels' column is assumed to contain NER labels.
     :param TrainingArguments: training arguments for the huggingface
@@ -345,14 +353,14 @@ def train_ner_model(
     # BERT tokenizer splits tokens into subtokens. The
     # tokenize_and_align_labels function correctly aligns labels and
     # subtokens.
-    tokenizer = AutoTokenizer.from_pretrained(hg_id)
+    tokenizer = AutoTokenizer.from_pretrained(hf_id)
     dataset = dataset.map(
         ft.partial(_tokenize_and_align_labels, tokenizer=tokenizer), batched=True
     )

     label_lst = dataset[train_split].features["labels"].feature.names
     model = AutoModelForTokenClassification.from_pretrained(
-        hg_id,
+        hf_id,
         num_labels=len(label_lst),
         id2label={i: label for i, label in enumerate(label_lst)},
         label2id={label: i for i, label in enumerate(label_lst)},
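
For context, a sketch of calling the renamed function end to end. Everything here is an assumption rather than part of the commit: "bert-base-cased" as the model ID, local CoNLL files, a toy tag set, and an explicit labels list so both splits share one ClassLabel mapping:

from datasets import DatasetDict
from transformers import TrainingArguments
from renard.ner_utils import hfdataset_from_conll2002, train_ner_model

labels = ["O", "B-PER", "I-PER"]  # hypothetical tag set
dataset = DatasetDict(
    {
        "train": hfdataset_from_conll2002("train.conll", labels=labels),
        "valid": hfdataset_from_conll2002("valid.conll", labels=labels),
    }
)
# hf_id is now the first positional parameter (previously hg_id).
model = train_ner_model(
    "bert-base-cased",
    dataset,
    TrainingArguments(output_dir="./ner_model"),
)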

renard/pipeline/relation_extraction.py

Lines changed: 8 additions & 8 deletions
@@ -1,7 +1,7 @@
 from typing import Any, Union, Optional, Literal
 import ast, re
 import functools as ft
-from datasets import load_dataset, Dataset as HGDataset
+from datasets import load_dataset, Dataset as HFDataset
 import torch
 from transformers import (
     AutoModelForSeq2SeqLM,
@@ -12,7 +12,7 @@
     DataCollatorForSeq2Seq,
     PreTrainedModel,
     EvalPrediction,
-    pipeline as hg_pipeline,
+    pipeline as hf_pipeline,
     BatchEncoding,
 )
 from more_itertools import flatten
@@ -46,7 +46,7 @@ def format_rel(rel: dict) -> str:
     return batch


-def load_ARF_dataset(tokenizer: PreTrainedTokenizerFast) -> HGDataset:
+def load_ARF_dataset(tokenizer: PreTrainedTokenizerFast) -> HFDataset:
     """
     Load the Artificial Relationships in Fiction dataset
     (https://huggingface.co/datasets/Despina/project_gutenberg) by
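
The dataset URL in the docstring is the only grounding for this sketch; load_ARF_dataset additionally preprocesses with the given tokenizer, whereas this just pulls the raw data for inspection (split and column layout are not specified by the commit):

from datasets import load_dataset

# Raw dataset named in the docstring above; inspect the splits and
# columns before relying on any particular layout.
arf = load_dataset("Despina/project_gutenberg")
print(arf)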
@@ -154,7 +154,7 @@ def __init__(
         self.model = (
             GenerativeRelationExtractor.DEFAULT_MODEL if model is None else model
         )
-        self.hg_pipeline = None
+        self.hf_pipeline = None
         self.batch_size = batch_size
         if device == "auto":
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -163,7 +163,7 @@ def __init__(

     def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter, **kwargs):
         super()._pipeline_init_(lang, progress_reporter, **kwargs)
-        self.hg_pipeline = hg_pipeline(
+        self.hf_pipeline = hf_pipeline(
             "text2text-generation",
             torch_dtype=torch.bfloat16,
             model=self.model,
@@ -173,19 +173,19 @@ def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter, **kwargs):
     def __call__(
         self, sentences: list[list[str]], characters: list[Character], **kwargs
     ) -> dict[str, Any]:
-        assert not self.hg_pipeline is None
+        assert not self.hf_pipeline is None

         sentence_relations = []

         # chunk as in the ARF dataset
-        dataset = HGDataset.from_list(
+        dataset = HFDataset.from_list(
             [
                 {"text": GenerativeRelationExtractor.task_prompt(" ".join(sent))}
                 for sent in sentences
             ]
         )
         for out in self._progress_(
-            self.hg_pipeline(KeyDataset(dataset, "text"), batch_size=self.batch_size),
+            self.hf_pipeline(KeyDataset(dataset, "text"), batch_size=self.batch_size),
             total=len(dataset),
         ):
             text_relations = out[0]["generated_text"]
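
The __call__ body above streams prompts through the renamed pipeline attribute. A standalone sketch of the same pattern (model ID and prompt text are placeholders, not from the commit):

from datasets import Dataset as HFDataset
from transformers import pipeline as hf_pipeline
from transformers.pipelines.pt_utils import KeyDataset

pipe = hf_pipeline("text2text-generation", model="google/flan-t5-small")
dataset = HFDataset.from_list([{"text": "extract relations: Alice met Bob."}])
# Iterating over KeyDataset lets the pipeline batch the generation calls.
for out in pipe(KeyDataset(dataset, "text"), batch_size=8):
    print(out[0]["generated_text"])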
