From 72d0de7a52d19ea00f7a014d4d0b1edfa6e3b96e Mon Sep 17 00:00:00 2001
From: Terrasse
Date: Fri, 4 Apr 2025 01:39:10 +0800
Subject: [PATCH] fix: use mtime by default in Trainer._rotate_checkpoints with
 automatic fallback

---
 src/transformers/trainer.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 9d4f553047bd..0017186b868f 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -3227,9 +3227,8 @@ def _save_checkpoint(self, model, trial):
 
         # Maybe delete some older checkpoints.
         if self.args.should_save:
-            # Solely rely on numerical checkpoint id for rotation.
-            # mtime is not reliable especially on some fuse fs in cloud environments.
-            self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
+            # Use mtime by default; filesystems without reliable mtime support are detected in `_sorted_checkpoints`.
+            self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
 
     def _save_rng_state(self, output_dir):
         # Save RNG state in non-distributed training
@@ -4042,7 +4041,17 @@ def _sorted_checkpoints(
                     ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
 
         checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+        # mtime is not reliable on all filesystems, especially on some FUSE filesystems in cloud environments,
+        # so we check whether the mtimes look fake and fall back to numerical ordering if needed.
+        if use_mtime and len(ordering_and_checkpoint_path) > 1:
+            mtime_diff = checkpoints_sorted[-1][0] - checkpoints_sorted[0][0]
+            if mtime_diff < 1.0:  # less than 1 second between oldest and newest, which is almost impossible when mtime works correctly
+                warnings.warn("mtime may not be reliable on this filesystem, falling back to numerical ordering")
+                return self._sorted_checkpoints(
+                    use_mtime=False, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
+                )
         checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+
         # Make sure we don't delete the best model.
         if (
             self.state.best_model_checkpoint is not None
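
Note: below is a minimal standalone sketch of the same heuristic outside of `Trainer`, under the assumption of a `checkpoint-<step>` directory naming convention as in the patch. The function name `sorted_checkpoints` and the 1-second threshold constant are illustrative, not part of the transformers API.

import os
import re
import warnings
from pathlib import Path


def sorted_checkpoints(output_dir, checkpoint_prefix="checkpoint", use_mtime=True):
    # Collect checkpoint-<step> directories under output_dir (illustrative layout).
    paths = [str(p) for p in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(p)]

    def step_of(path):
        # Extract the numeric step suffix used for the fallback ordering.
        match = re.search(rf"{checkpoint_prefix}-(\d+)", path)
        return int(match.group(1)) if match else -1

    if use_mtime and len(paths) > 1:
        mtimes = [os.path.getmtime(p) for p in paths]
        # On some FUSE-backed cloud filesystems every entry reports (nearly) the same
        # mtime; a spread under one second across all checkpoints is treated as unreliable.
        if max(mtimes) - min(mtimes) < 1.0:
            warnings.warn("mtime looks unreliable on this filesystem, falling back to numerical ordering")
            use_mtime = False

    # Oldest first, either by mtime or by step number.
    return sorted(paths, key=os.path.getmtime if use_mtime else step_of)

Called on a run directory, this returns checkpoints oldest-first, so a rotation routine can delete from the front of the list until the save limit is satisfied, mirroring how `_rotate_checkpoints` consumes the sorted result.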