Skip to content

Commit 590df64

Browse files
committed
[timecode] Fix VFR timing, persistent decoder, and output command accuracy
- Fix PyAV read() to reuse a persistent decoder generator, preventing the last frame from being dropped at EOF due to the B-frame reorder buffer being flushed on GC - Fix _handle_eof() to seek by PTS seconds instead of by frame number (which is now a CFR-equivalent approximation, not a decode count) - Fix get_timecode() to skip nearest-frame snapping for Timecode-backed FrameTimecodes, so scene boundary timecodes are PTS-accurate for VFR - Fix FlashFilter to cache the min_scene_len threshold in seconds from the first frame's framerate, avoiding incorrect thresholds when OpenCV reports an incorrect average fps - Fix FCP7 XML: use seconds*fps for frame numbers, dynamic NTSC flag - Fix OTIO: use seconds*frame_rate for RationalTime values - Add $START_PTS and $END_PTS (ms) to split-video filename templates - Refactor test_vfr.py: use open_video(), add EXPECTED_SCENES_VFR ground truth, parameterize the scene detection test for both pyav and opencv backends
1 parent 38f59c8 commit 590df64

6 files changed

Lines changed: 98 additions & 69 deletions

File tree

scenedetect/_cli/commands.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -401,18 +401,19 @@ def _save_xml_fcp(
401401
sequence = ElementTree.SubElement(project, "sequence")
402402
ElementTree.SubElement(sequence, "name").text = context.video_stream.name
403403

404+
fps = float(context.video_stream.frame_rate)
405+
ntsc = "True" if context.video_stream.frame_rate.denominator != 1 else "False"
404406
duration = scenes[-1][1] - scenes[0][0]
405-
ElementTree.SubElement(sequence, "duration").text = f"{duration.frame_num}"
407+
ElementTree.SubElement(sequence, "duration").text = str(round(duration.seconds * fps))
406408

407409
rate = ElementTree.SubElement(sequence, "rate")
408-
fps = float(context.video_stream.frame_rate)
409410
ElementTree.SubElement(rate, "timebase").text = str(round(fps))
410-
ElementTree.SubElement(rate, "ntsc").text = "False"
411+
ElementTree.SubElement(rate, "ntsc").text = ntsc
411412

412413
timecode = ElementTree.SubElement(sequence, "timecode")
413414
tc_rate = ElementTree.SubElement(timecode, "rate")
414415
ElementTree.SubElement(tc_rate, "timebase").text = str(round(fps))
415-
ElementTree.SubElement(tc_rate, "ntsc").text = "False"
416+
ElementTree.SubElement(tc_rate, "ntsc").text = ntsc
416417
ElementTree.SubElement(timecode, "frame").text = "0"
417418
ElementTree.SubElement(timecode, "displayformat").text = "NDF"
418419

@@ -430,11 +431,11 @@ def _save_xml_fcp(
430431
ElementTree.SubElement(clip, "rate").append(
431432
ElementTree.fromstring(f"<timebase>{round(fps)}</timebase>")
432433
)
433-
# TODO: Are these supposed to be frame numbers or another format?
434-
ElementTree.SubElement(clip, "start").text = str(start.frame_num)
435-
ElementTree.SubElement(clip, "end").text = str(end.frame_num)
436-
ElementTree.SubElement(clip, "in").text = str(start.frame_num)
437-
ElementTree.SubElement(clip, "out").text = str(end.frame_num)
434+
# Frame numbers relative to the declared <timebase> fps, computed from PTS seconds.
435+
ElementTree.SubElement(clip, "start").text = str(round(start.seconds * fps))
436+
ElementTree.SubElement(clip, "end").text = str(round(end.seconds * fps))
437+
ElementTree.SubElement(clip, "in").text = str(round(start.seconds * fps))
438+
ElementTree.SubElement(clip, "out").text = str(round(end.seconds * fps))
438439

439440
file_ref = ElementTree.SubElement(clip, "file", id=f"file{i + 1}")
440441
ElementTree.SubElement(file_ref, "name").text = context.video_stream.name
@@ -535,12 +536,12 @@ def save_otio(
535536
"duration": {
536537
"OTIO_SCHEMA": "RationalTime.1",
537538
"rate": frame_rate,
538-
"value": float((end - start).frame_num),
539+
"value": (end - start).seconds * frame_rate,
539540
},
540541
"start_time": {
541542
"OTIO_SCHEMA": "RationalTime.1",
542543
"rate": frame_rate,
543-
"value": float(start.frame_num),
544+
"value": start.seconds * frame_rate,
544545
},
545546
},
546547
"enabled": True,

scenedetect/backends/pyav.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ def __init__(
8181
self._name = "" if name is None else name
8282
self._path = ""
8383
self._frame: ty.Optional[av.VideoFrame] = None
84+
self._decoder: ty.Optional[ty.Generator] = None
85+
self._decode_count: int = 0
8486
self._reopened = True
8587

8688
if threading_mode:
@@ -197,14 +199,13 @@ def position_ms(self) -> float:
197199

198200
@property
199201
def frame_number(self) -> int:
200-
"""Current position within stream as the frame number.
202+
"""Current position within stream as the frame number (CFR-equivalent).
201203
202-
Will return 0 until the first frame is `read`."""
203-
204-
if self._frame:
205-
# frame_number is 1-indexed, so add 1 to the 0-based frame position.
206-
return round(self._frame.time * self.frame_rate) + 1
207-
return 0
204+
Will return 0 until the first frame is `read`. For VFR video this is an approximation
205+
derived from PTS × framerate; use `position` for accurate PTS-based timing."""
206+
if self._frame is None:
207+
return 0
208+
return round(self._frame.time * float(self.frame_rate)) + 1
208209

209210
@property
210211
def rate(self) -> Fraction:
@@ -261,6 +262,8 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]) -> None:
261262
(self.base_timecode + target).seconds / self._video_stream.time_base
262263
)
263264
self._frame = None
265+
self._decoder = None
266+
self._decode_count = 0
264267
self._container.seek(target_pts, stream=self._video_stream)
265268
if not beginning:
266269
self.read(decode=False)
@@ -272,15 +275,23 @@ def reset(self):
272275
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""
273276
self._container.close()
274277
self._frame = None
278+
self._decoder = None
279+
self._decode_count = 0
275280
try:
276281
self._container = av.open(self._path if self._path else self._io)
277282
except Exception as ex:
278283
raise VideoOpenFailure() from ex
279284

280285
def read(self, decode: bool = True) -> ty.Union[np.ndarray, bool]:
286+
# Reuse a persistent decoder generator so the codec's internal frame buffer (used for
287+
# B-frame reordering) is never flushed prematurely. Creating a new generator each call
288+
# caused the last buffered frame to be lost at EOF.
289+
if self._decoder is None:
290+
self._decoder = self._container.decode(video=0)
281291
try:
282292
last_frame = self._frame
283-
self._frame = next(self._container.decode(video=0))
293+
self._frame = next(self._decoder)
294+
self._decode_count += 1
284295
except av.error.EOFError:
285296
self._frame = last_frame
286297
if self._handle_eof():
@@ -345,7 +356,7 @@ def _handle_eof(self):
345356
# Don't re-open the video if we can't seek or aren't in AUTO/FRAME thread_type mode.
346357
if not self.is_seekable or self._video_stream.thread_type not in ("AUTO", "FRAME"):
347358
return False
348-
last_frame = self.frame_number
359+
last_pos_secs = self.position.seconds
349360
orig_pos = self._io.tell()
350361
try:
351362
self._io.seek(0)
@@ -355,5 +366,6 @@ def _handle_eof(self):
355366
raise
356367
self._container.close()
357368
self._container = container
358-
self.seek(last_frame)
369+
self._decoder = None
370+
self.seek(last_pos_secs)
359371
return True

scenedetect/common.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,10 @@ def get_timecode(
375375
str: The current time in the form ``"HH:MM:SS[.nnn]"``.
376376
"""
377377
# Compute hours and minutes based off of seconds, and update seconds.
378-
if nearest_frame and self.framerate:
378+
# For PTS-backed timecodes, the PTS already represents an exact frame boundary, so we use
379+
# `seconds` directly. For non-PTS timecodes, `nearest_frame` snaps to the nearest frame
380+
# boundary using frame_num, which avoids floating point drift in CFR video display.
381+
if nearest_frame and self.framerate and not isinstance(self._time, Timecode):
379382
secs = self.frame_num / self.framerate
380383
else:
381384
secs = self.seconds

scenedetect/detector.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def __init__(self, mode: Mode, length: int):
122122
"""
123123
self._mode = mode
124124
self._filter_length = length # Number of frames to use for activating the filter.
125+
self._filter_secs: ty.Optional[float] = None # Threshold in seconds, computed on first use.
125126
self._last_above = None # Last frame above threshold.
126127
self._merge_enabled = False # Used to disable merging until at least one cut was found.
127128
self._merge_triggered = False # True when the merge filter is active.
@@ -143,9 +144,12 @@ def filter(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[Fram
143144
raise RuntimeError("Unhandled FlashFilter mode.")
144145

145146
def _filter_suppress(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[int]:
146-
framerate = timecode.framerate
147-
assert framerate >= 0
148-
min_length_met: bool = (timecode - self._last_above) >= (self._filter_length / framerate)
147+
assert timecode.framerate >= 0
148+
# Compute the threshold in seconds once from the first frame's framerate. This avoids
149+
# using an incorrect average fps (e.g. OpenCV on VFR video) on subsequent frames.
150+
if self._filter_secs is None:
151+
self._filter_secs = self._filter_length / timecode.framerate
152+
min_length_met: bool = (timecode - self._last_above) >= self._filter_secs
149153
if not (above_threshold and min_length_met):
150154
return []
151155
# Both length and threshold requirements were satisfied. Emit the cut, and wait until both
@@ -154,16 +158,17 @@ def _filter_suppress(self, timecode: FrameTimecode, above_threshold: bool) -> ty
154158
return [timecode]
155159

156160
def _filter_merge(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[int]:
157-
framerate = timecode.framerate
158-
assert framerate >= 0
159-
min_length_met: bool = (timecode - self._last_above) >= (self._filter_length / framerate)
161+
assert timecode.framerate >= 0
162+
# Compute the threshold in seconds once from the first frame's framerate.
163+
if self._filter_secs is None:
164+
self._filter_secs = self._filter_length / timecode.framerate
165+
min_length_met: bool = (timecode - self._last_above) >= self._filter_secs
160166
# Ensure last frame is always advanced to the most recent one that was above the threshold.
161167
if above_threshold:
162168
self._last_above = timecode
163169
if self._merge_triggered:
164170
# This frame was under the threshold, see if enough frames passed to disable the filter.
165-
num_merged_frames = self._last_above - self._merge_start
166-
if min_length_met and not above_threshold and num_merged_frames >= self._filter_length:
171+
if min_length_met and not above_threshold and (self._last_above - self._merge_start) >= self._filter_secs:
167172
self._merge_triggered = False
168173
return [self._last_above]
169174
# Keep merging until enough frames pass below the threshold.

scenedetect/output/video.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ class SceneMetadata:
125125
def default_formatter(template: str) -> PathFormatter:
126126
"""Formats filenames using a template string which allows the following variables:
127127
128-
`$VIDEO_NAME`, `$SCENE_NUMBER`, `$START_TIME`, `$END_TIME`, `$START_FRAME`, `$END_FRAME`
128+
`$VIDEO_NAME`, `$SCENE_NUMBER`, `$START_TIME`, `$END_TIME`, `$START_FRAME`, `$END_FRAME`,
129+
`$START_PTS`, `$END_PTS` (presentation timestamp in milliseconds, accurate for VFR video)
129130
"""
130131
MIN_DIGITS = 3
131132
format_scene_number: PathFormatter = lambda video, scene: (
@@ -139,6 +140,8 @@ def default_formatter(template: str) -> PathFormatter:
139140
END_TIME=str(scene.end.get_timecode().replace(":", ";")),
140141
START_FRAME=str(scene.start.frame_num),
141142
END_FRAME=str(scene.end.frame_num),
143+
START_PTS=str(round(scene.start.seconds * 1000)),
144+
END_PTS=str(round(scene.end.seconds * 1000)),
142145
)
143146
return formatter
144147

tests/test_vfr.py

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,38 +17,33 @@
1717

1818
import pytest
1919

20-
from scenedetect import SceneManager
20+
from scenedetect import SceneManager, open_video
2121
from scenedetect.common import FrameTimecode, Timecode
2222
from scenedetect.detectors import ContentDetector
2323
from scenedetect.stats_manager import StatsManager
2424

25-
26-
def _open_pyav(path: str):
27-
"""Open a video with the PyAV backend."""
28-
from scenedetect.backends.pyav import VideoStreamAv
29-
30-
return VideoStreamAv(path)
31-
32-
33-
def _open_opencv(path: str):
34-
"""Open a video with the OpenCV backend."""
35-
from scenedetect.backends.opencv import VideoStreamCv2
36-
37-
return VideoStreamCv2(path)
25+
# Expected scene cuts for `goldeneye-vfr.mp4` detected with ContentDetector() and end_time=10.0s.
26+
# Entries are (start_timecode, end_timecode). All backends should agree on cut timecodes since
27+
# CAP_PROP_POS_MSEC gives accurate PTS-derived timestamps. The last scene ends at the clip
28+
# boundary (end_time) which may vary slightly between backends based on frame counting.
29+
EXPECTED_SCENES_VFR: ty.List[ty.Tuple[str, str]] = [
30+
("00:00:00.000", "00:00:03.921"),
31+
("00:00:03.921", "00:00:09.676"),
32+
]
3833

3934

4035
class TestVFR:
4136
"""Test VFR video handling."""
4237

4338
def test_vfr_position_is_timecode(self, test_vfr_video: str):
4439
"""Position should be a Timecode-backed FrameTimecode."""
45-
video = _open_pyav(test_vfr_video)
40+
video = open_video(test_vfr_video, backend="pyav")
4641
assert video.read() is not False
4742
assert isinstance(video.position._time, Timecode)
4843

4944
def test_vfr_position_monotonic_pyav(self, test_vfr_video: str):
50-
"""PTS-based position should be monotonically non-decreasing."""
51-
video = _open_pyav(test_vfr_video)
45+
"""PTS-based position should be monotonically non-decreasing (PyAV)."""
46+
video = open_video(test_vfr_video, backend="pyav")
5247
last_seconds = -1.0
5348
frame_count = 0
5449
while True:
@@ -64,8 +59,8 @@ def test_vfr_position_monotonic_pyav(self, test_vfr_video: str):
6459
assert frame_count > 0
6560

6661
def test_vfr_position_monotonic_opencv(self, test_vfr_video: str):
67-
"""PTS-based position should be monotonically non-decreasing with OpenCV."""
68-
video = _open_opencv(test_vfr_video)
62+
"""PTS-based position should be monotonically non-decreasing (OpenCV)."""
63+
video = open_video(test_vfr_video, backend="opencv")
6964
last_seconds = -1.0
7065
frame_count = 0
7166
while True:
@@ -80,23 +75,36 @@ def test_vfr_position_monotonic_opencv(self, test_vfr_video: str):
8075
frame_count += 1
8176
assert frame_count > 0
8277

83-
def test_vfr_scene_detection(self, test_vfr_video: str):
84-
"""Scene detection should work on VFR video and produce reasonable timestamps."""
85-
video = _open_pyav(test_vfr_video)
78+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
79+
def test_vfr_scene_detection(self, test_vfr_video: str, backend: str):
80+
"""Scene detection on VFR video should produce timestamps matching known ground truth.
81+
82+
Both PyAV (native PTS) and OpenCV (CAP_PROP_POS_MSEC) should agree on scene cuts since
83+
both expose accurate PTS-derived timestamps.
84+
"""
85+
video = open_video(test_vfr_video, backend=backend)
8686
sm = SceneManager()
8787
sm.add_detector(ContentDetector())
88-
sm.detect_scenes(video=video)
88+
sm.detect_scenes(video=video, end_time=10.0)
8989
scene_list = sm.get_scene_list()
90-
# Should detect at least one scene.
91-
assert len(scene_list) > 0
92-
# All timestamps should be non-negative and within video duration.
93-
for start, end in scene_list:
94-
assert start.seconds >= 0
95-
assert end.seconds > start.seconds
90+
91+
# The last scene ends at the clip boundary which may vary by backend; only check known cuts.
92+
assert len(scene_list) >= len(EXPECTED_SCENES_VFR), (
93+
f"[{backend}] Expected at least {len(EXPECTED_SCENES_VFR)} scenes, got {len(scene_list)}"
94+
)
95+
for i, ((start, end), (exp_start_tc, exp_end_tc)) in enumerate(
96+
zip(scene_list, EXPECTED_SCENES_VFR, strict=False)
97+
):
98+
assert start.get_timecode() == exp_start_tc, (
99+
f"[{backend}] Scene {i + 1} start: expected {exp_start_tc!r}, got {start.get_timecode()!r}"
100+
)
101+
assert end.get_timecode() == exp_end_tc, (
102+
f"[{backend}] Scene {i + 1} end: expected {exp_end_tc!r}, got {end.get_timecode()!r}"
103+
)
96104

97105
def test_vfr_seek_pyav(self, test_vfr_video: str):
98106
"""Seeking should work with VFR video."""
99-
video = _open_pyav(test_vfr_video)
107+
video = open_video(test_vfr_video, backend="pyav")
100108
target_time = 2.0 # seconds
101109
video.seek(target_time)
102110
frame = video.read()
@@ -106,20 +114,18 @@ def test_vfr_seek_pyav(self, test_vfr_video: str):
106114

107115
def test_vfr_stats_manager(self, test_vfr_video: str):
108116
"""StatsManager should work correctly with VFR video."""
109-
video = _open_pyav(test_vfr_video)
117+
video = open_video(test_vfr_video, backend="pyav")
110118
stats = StatsManager()
111119
sm = SceneManager(stats_manager=stats)
112120
sm.add_detector(ContentDetector())
113121
sm.detect_scenes(video=video)
114-
# Stats should have metrics for frames.
115-
scene_list = sm.get_scene_list()
116-
assert len(scene_list) > 0
122+
assert len(sm.get_scene_list()) > 0
117123

118124
def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
119125
"""CSV export should work correctly with VFR video."""
120126
from scenedetect.output import write_scene_list
121127

122-
video = _open_pyav(test_vfr_video)
128+
video = open_video(test_vfr_video, backend="pyav")
123129
sm = SceneManager()
124130
sm.add_detector(ContentDetector())
125131
sm.detect_scenes(video=video)
@@ -134,18 +140,17 @@ def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
134140
with open(csv_path, "r") as f:
135141
reader = csv.reader(f)
136142
rows = list(reader)
137-
# Header + at least one data row.
138143
assert len(rows) >= 3 # 2 header rows + data
139144

140145
def test_cfr_position_is_timecode(self, test_movie_clip: str):
141146
"""CFR video positions should also be Timecode-backed with PTS support."""
142-
video = _open_pyav(test_movie_clip)
147+
video = open_video(test_movie_clip, backend="pyav")
143148
assert video.read() is not False
144149
assert isinstance(video.position._time, Timecode)
145150

146151
def test_cfr_frame_num_exact(self, test_movie_clip: str):
147152
"""For CFR video, frame_num should be exact (not approximate)."""
148-
video = _open_pyav(test_movie_clip)
153+
video = open_video(test_movie_clip, backend="pyav")
149154
for expected_frame in range(1, 11):
150155
assert video.read() is not False
151156
assert video.position.frame_num == expected_frame - 1

0 commit comments

Comments
 (0)