
Commit 31af40a

fix torch.cuda.amp deprecated error

Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com>
1 parent 3842e59 commit 31af40a

28 files changed

Lines changed: 54 additions & 54 deletions
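
Every hunk below applies the same mechanical substitution: the deprecated, device-bound entry points torch.cuda.amp.GradScaler() and torch.cuda.amp.autocast() are replaced by the device-agnostic torch.GradScaler("cuda") and torch.autocast("cuda"). For reference, here is a minimal, self-contained training step under the new API; the toy model, optimizer, and random batch are illustrative, assuming PyTorch 2.4+ on a CUDA machine:

import torch
from torch import nn

device = torch.device("cuda")
model = nn.Linear(16, 2).to(device)  # toy model, illustrative only
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_function = nn.CrossEntropyLoss()

# New device-agnostic API; previously: scaler = torch.cuda.amp.GradScaler()
scaler = torch.GradScaler("cuda")

inputs = torch.randn(8, 16, device=device)        # illustrative batch
labels = torch.randint(0, 2, (8,), device=device)

optimizer.zero_grad()
# Previously: with torch.cuda.amp.autocast():
with torch.autocast("cuda"):
    outputs = model(inputs)
    loss = loss_function(outputs, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()

Because the device type is now an explicit argument, the same call sites can later target other backends that AMP supports.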


3d_registration/learn2reg_nlst_paired_lung_ct.ipynb

Lines changed: 2 additions & 2 deletions
@@ -872,7 +872,7 @@
 "source": [
 "# Automatic mixed precision (AMP) for faster training\n",
 "amp_enabled = True\n",
-"scaler = torch.cuda.amp.GradScaler()\n",
+"scaler = torch.GradScaler(\"cuda\")\n",
 "\n",
 "# Tensorboard\n",
 "if do_save:\n",
@@ -1139,7 +1139,7 @@
 "# Forward pass\n",
 "model.eval()\n",
 "with torch.no_grad():\n",
-" with torch.cuda.amp.autocast(enabled=amp_enabled):\n",
+" with torch.autocast(\"cuda\", enabled=amp_enabled):\n",
 " ddf_image, ddf_keypoints, pred_image, pred_label = forward(\n",
 " check_data[\"fixed_image\"].to(device),\n",
 " check_data[\"moving_image\"].to(device),\n",

3d_registration/learn2reg_oasis_unpaired_brain_mr.ipynb

Lines changed: 4 additions & 4 deletions
@@ -610,7 +610,7 @@
 "source": [
 "# Automatic mixed precision (AMP) for faster training\n",
 "amp_enabled = True\n",
-"scaler = torch.cuda.amp.GradScaler()\n",
+"scaler = torch.GradScaler(\"cuda\")\n",
 "\n",
 "# Tensorboard\n",
 "if do_save:\n",
@@ -646,7 +646,7 @@
 "\n",
 " # Forward pass and loss\n",
 " optimizer.zero_grad()\n",
-" with torch.cuda.amp.autocast(enabled=amp_enabled):\n",
+" with torch.autocast(\"cuda\", enabled=amp_enabled):\n",
 " ddf_image, pred_image, pred_label_one_hot = forward(\n",
 " fixed_image, moving_image, moving_label, model, warp_layer, num_classes=4\n",
 " )\n",
@@ -694,7 +694,7 @@
 " # moving_label_35 = batch_data[\"moving_label_35\"].to(device)\n",
 " n_steps += 1\n",
 " # Infer\n",
-" with torch.cuda.amp.autocast(enabled=amp_enabled):\n",
+" with torch.autocast(\"cuda\", enabled=amp_enabled):\n",
 " ddf_image, pred_image, pred_label_one_hot = forward(\n",
 " fixed_image, moving_image, moving_label_4, model, warp_layer, num_classes=4\n",
 " )\n",
@@ -860,7 +860,7 @@
 "# Forward pass\n",
 "model.eval()\n",
 "with torch.no_grad():\n",
-" with torch.cuda.amp.autocast(enabled=amp_enabled):\n",
+" with torch.autocast(\"cuda\", enabled=amp_enabled):\n",
 " ddf_image, pred_image, pred_label_one_hot = forward(\n",
 " fixed_image, moving_image, moving_label_35, model, warp_layer, num_classes=35\n",
 " )"

3d_segmentation/brats_segmentation_3d.ipynb

Lines changed: 4 additions & 4 deletions
@@ -473,14 +473,14 @@
 " )\n",
 "\n",
 " if VAL_AMP:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " return _compute(input)\n",
 " else:\n",
 " return _compute(input)\n",
 "\n",
 "\n",
 "# use amp to accelerate training\n",
-"scaler = torch.cuda.amp.GradScaler()\n",
+"scaler = torch.GradScaler(\"cuda\")\n",
 "# enable cuDNN benchmark\n",
 "torch.backends.cudnn.benchmark = True"
 ]
@@ -526,7 +526,7 @@
 " batch_data[\"label\"].to(device),\n",
 " )\n",
 " optimizer.zero_grad()\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " outputs = model(inputs)\n",
 " loss = loss_function(outputs, labels)\n",
 " scaler.scale(loss).backward()\n",
@@ -924,7 +924,7 @@
 " )\n",
 "\n",
 " if VAL_AMP:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " return _compute(input)\n",
 " else:\n",
 " return _compute(input)"

3d_segmentation/swin_unetr_btcv_segmentation_3d.ipynb

Lines changed: 3 additions & 3 deletions
@@ -493,7 +493,7 @@
 "torch.backends.cudnn.benchmark = True\n",
 "loss_function = DiceCELoss(to_onehot_y=True, softmax=True)\n",
 "optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)\n",
-"scaler = torch.cuda.amp.GradScaler()"
+"scaler = torch.GradScaler(\"cuda\")"
 ]
 },
 {
@@ -516,7 +516,7 @@
 " with torch.no_grad():\n",
 " for batch in epoch_iterator_val:\n",
 " val_inputs, val_labels = (batch[\"image\"].cuda(), batch[\"label\"].cuda())\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " val_outputs = sliding_window_inference(val_inputs, (96, 96, 96), 4, model)\n",
 " val_labels_list = decollate_batch(val_labels)\n",
 " val_labels_convert = [post_label(val_label_tensor) for val_label_tensor in val_labels_list]\n",
@@ -537,7 +537,7 @@
 " for step, batch in enumerate(epoch_iterator):\n",
 " step += 1\n",
 " x, y = (batch[\"image\"].cuda(), batch[\"label\"].cuda())\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " logit_map = model(x)\n",
 " loss = loss_function(logit_map, y)\n",
 " scaler.scale(loss).backward()\n",

acceleration/automatic_mixed_precision.ipynb

Lines changed: 3 additions & 3 deletions
@@ -289,7 +289,7 @@
 " ).to(device)\n",
 " loss_function = DiceLoss(to_onehot_y=True, softmax=True)\n",
 " optimizer = torch.optim.Adam(model.parameters(), 1e-4)\n",
-" scaler = torch.cuda.amp.GradScaler() if amp else None\n",
+" scaler = torch.GradScaler(\"cuda\") if amp else None\n",
 "\n",
 " post_pred = Compose([AsDiscrete(argmax=True, to_onehot=2)])\n",
 " post_label = Compose([AsDiscrete(to_onehot=2)])\n",
@@ -321,7 +321,7 @@
 " )\n",
 " optimizer.zero_grad()\n",
 " if amp and scaler is not None:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " outputs = model(inputs)\n",
 " loss = loss_function(outputs, labels)\n",
 " scaler.scale(loss).backward()\n",
@@ -353,7 +353,7 @@
 " roi_size = (160, 160, 128)\n",
 " sw_batch_size = 4\n",
 " if amp:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " val_outputs = sliding_window_inference(val_inputs, roi_size, sw_batch_size, model)\n",
 " else:\n",
 " val_outputs = sliding_window_inference(val_inputs, roi_size, sw_batch_size, model)\n",

acceleration/distributed_training/brats_training_ddp.py

Lines changed: 3 additions & 3 deletions
@@ -170,7 +170,7 @@ def main_worker(args):
 device = torch.device(f"cuda:{os.environ['LOCAL_RANK']}")
 torch.cuda.set_device(device)
 # use amp to accelerate training
-scaler = torch.cuda.amp.GradScaler()
+scaler = torch.GradScaler("cuda")
 torch.backends.cudnn.benchmark = True

 total_start = time.time()
@@ -320,7 +320,7 @@ def train(train_loader, model, criterion, optimizer, lr_scheduler, scaler):
 for batch_data in train_loader:
 step += 1
 optimizer.zero_grad()
-with torch.cuda.amp.autocast():
+with torch.autocast("cuda"):
 outputs = model(batch_data["image"])
 loss = criterion(outputs, batch_data["label"])
 scaler.scale(loss).backward()
@@ -339,7 +339,7 @@ def evaluate(model, val_loader, dice_metric, dice_metric_batch, post_trans):
 model.eval()
 with torch.no_grad():
 for val_data in val_loader:
-with torch.cuda.amp.autocast():
+with torch.autocast("cuda"):
 val_outputs = sliding_window_inference(
 inputs=val_data["image"], roi_size=(240, 240, 160), sw_batch_size=4, predictor=model, overlap=0.6
 )
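
Several validation hunks in this commit wrap MONAI's sliding-window inference in the new autocast context. For reference, a minimal sketch of that pattern; the toy UNet and the random volume are illustrative, not taken from the repository:

import torch
from monai.inferers import sliding_window_inference
from monai.networks.nets import UNet

device = torch.device("cuda")
model = UNet(  # toy network, illustrative only
    spatial_dims=3, in_channels=1, out_channels=2, channels=(8, 16, 32), strides=(2, 2)
).to(device)
model.eval()

val_image = torch.randn(1, 1, 96, 96, 96, device=device)  # illustrative volume
with torch.no_grad():
    # Previously: with torch.cuda.amp.autocast():
    with torch.autocast("cuda"):
        val_outputs = sliding_window_inference(
            inputs=val_image, roi_size=(64, 64, 64), sw_batch_size=2, predictor=model
        )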

acceleration/fast_model_training_guide.md

Lines changed: 2 additions & 2 deletions
@@ -120,7 +120,7 @@ nvtx.end_range(rng_train_dataload)
 optimizer.zero_grad()

 rng_train_forward = nvtx.start_range(message="forward", color="green")
-with torch.cuda.amp.autocast():
+with torch.autocast("cuda"):
 outputs = model(inputs)
 loss = loss_function(outputs, labels)
 nvtx.end_range(rng_train_forward)
@@ -231,7 +231,7 @@ NVIDIA GPUs have been widely applied in many areas of deep learning training and

 In 2017, NVIDIA researchers developed a methodology for mixed-precision training, which combined single-precision (FP32) with half-precision (e.g., FP16) format when training a network, and it achieved a similar accuracy as FP32 training using the same hyperparameters.

-For the PyTorch 1.6 release, developers at NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, `torch.cuda.amp`.
+For the PyTorch 1.6 release, developers at NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, `torch.autocast`.

 MONAI workflows can easily set `amp=True/False` in `SupervisedTrainer` or `SupervisedEvaluator` during training or evaluation to enable/disable AMP.
 We tried to compare the training speed of the spleen segmentation task if AMP ON/OFF on NVIDIA A100 GPU with CUDA 11 and obtained some benchmark results:
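
The guide text above notes that MONAI engines expose AMP as a single amp=True/False flag rather than explicit autocast/GradScaler plumbing. A minimal sketch, assuming a recent MONAI release; the toy network and synthetic dict-style dataset are illustrative:

import torch
from torch import nn
from torch.utils.data import DataLoader
from monai.engines import SupervisedTrainer

device = torch.device("cuda")
net = nn.Linear(16, 2).to(device)  # toy network, illustrative only
data = [{"image": torch.randn(16), "label": torch.tensor(0)} for _ in range(32)]
loader = DataLoader(data, batch_size=8)  # default collate yields {"image": (8, 16), "label": (8,)}

trainer = SupervisedTrainer(
    device=device,
    max_epochs=1,
    train_data_loader=loader,
    network=net,
    optimizer=torch.optim.Adam(net.parameters(), 1e-3),
    loss_function=nn.CrossEntropyLoss(),
    amp=True,  # the engine handles autocast and gradient scaling internally
)
trainer.run()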

acceleration/fast_training_tutorial.ipynb

Lines changed: 3 additions & 3 deletions
@@ -486,7 +486,7 @@
 " momentum=0.9,\n",
 " weight_decay=0.00004,\n",
 " )\n",
-" scaler = torch.cuda.amp.GradScaler()\n",
+" scaler = torch.GradScaler(\"cuda\")\n",
 " else:\n",
 " optimizer = Adam(model.parameters(), learning_rate)\n",
 "\n",
@@ -528,7 +528,7 @@
 " if fast:\n",
 " # profiling: forward\n",
 " with nvtx.annotate(\"forward\", color=\"green\") if profiling else no_profiling:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " outputs = model(inputs)\n",
 " loss = loss_function(outputs, labels)\n",
 "\n",
@@ -584,7 +584,7 @@
 " with nvtx.annotate(\"sliding window\", color=\"green\") if profiling else no_profiling:\n",
 " # set AMP for MONAI validation\n",
 " if fast:\n",
-" with torch.cuda.amp.autocast():\n",
+" with torch.autocast(\"cuda\"):\n",
 " val_outputs = sliding_window_inference(val_inputs, roi_size, sw_batch_size, model)\n",
 " else:\n",
 " val_outputs = sliding_window_inference(val_inputs, roi_size, sw_batch_size, model)\n",

auto3dseg/docs/ensemble.md

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ class InferClass:
 batch_data = list_data_collate([batch_data])
 infer_image = batch_data["image"].to(self.device)

-with torch.cuda.amp.autocast():
+with torch.autocast("cuda"):
 batch_data["pred"] = sliding_window_inference(
 infer_image,
 self.patch_size_valid,

automl/DiNTS/search_dints.py

Lines changed: 2 additions & 2 deletions
@@ -429,7 +429,7 @@ def main():

 # amp
 if amp:
-from torch.cuda.amp import autocast, GradScaler
+from torch import autocast, GradScaler

 scaler = GradScaler()
 if dist.get_rank() == 0:
@@ -638,7 +638,7 @@ def main():
 sw_batch_size = num_sw_batch_size

 if amp:
-with torch.cuda.amp.autocast():
+with torch.autocast("cuda"):
 pred = sliding_window_inference(
 val_images,
 roi_size,
