|
15 | 15 | from torchmetrics import Metric, MeanMetric |
16 | 16 | import lightning.pytorch as pl |
17 | 17 | from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only |
18 | | -from lightning.pytorch.callbacks import Callback |
19 | 18 |
|
20 | 19 | from basics.base_module import CategorizedModule |
21 | 20 | from utils.hparams import hparams |
|
33 | 32 | format=log_format, datefmt='%m/%d %I:%M:%S %p') |
34 | 33 |
|
35 | 34 |
|
class OptimizerTimerCallback(Callback):
    """Log the GPU wall-clock duration of each optimizer step.

    Uses a pair of CUDA events so the measured interval is the real GPU
    execution time of the step, not merely the CPU-side kernel launch time.
    The duration (milliseconds) is logged per step to the configured logger
    under ``stats/optimizer_step_duration_ms``.
    """

    def __init__(self):
        super().__init__()
        # CUDA events with timing enabled capture true GPU execution time
        # rather than the CPU launch time.
        self.start_event = torch.cuda.Event(enable_timing=True)
        self.end_event = torch.cuda.Event(enable_timing=True)

    def on_before_optimizer_step(self, trainer, pl_module, optimizer):
        # Only start timing after the first epoch, so warm-up effects do not
        # skew the measurements.
        if trainer.current_epoch > 0:
            self.start_event.record()

    def on_after_optimizer_step(self, trainer, pl_module, optimizer):
        # NOTE(review): `on_after_optimizer_step` does not appear to be a
        # standard lightning.pytorch Callback hook — confirm Lightning
        # actually invokes it; otherwise the end event is never recorded.
        if trainer.current_epoch > 0:
            self.end_event.record()
            torch.cuda.synchronize()  # wait for the GPU to finish this step

            # Elapsed time between the two events, in milliseconds.
            # (Renamed from `epoch_time_ms`: this is a per-step duration,
            # not a per-epoch one.)
            step_duration_ms = self.start_event.elapsed_time(self.end_event)

            # pl_module.log routes the scalar to the currently configured
            # logger (e.g. TensorBoardLogger).
            pl_module.log(
                "stats/optimizer_step_duration_ms",
                step_duration_ms,
                on_step=True,
                on_epoch=False,
                prog_bar=True
            )
66 | | - |
67 | 35 | class BaseTask(pl.LightningModule): |
68 | 36 | """ |
69 | 37 | Base class for training tasks. |
@@ -455,7 +423,6 @@ def start(cls): |
455 | 423 | ), |
456 | 424 | # LearningRateMonitor(logging_interval='step'), |
457 | 425 | DsTQDMProgressBar(), |
458 | | - OptimizerTimerCallback(), |
459 | 426 | ], |
460 | 427 | logger=DsTensorBoardLogger( |
461 | 428 | save_dir=str(work_dir), |
|
0 commit comments