Skip to content

Commit 590df64

Browse files
committed
[timecode] Fix VFR timing, persistent decoder, and output command accuracy
- Fix PyAV read() to reuse a persistent decoder generator, preventing the last frame from being dropped at EOF due to the B-frame reorder buffer being flushed on GC - Fix _handle_eof() to seek by PTS seconds instead of by frame number (which is now a CFR-equivalent approximation, not a decode count) - Fix get_timecode() to skip nearest-frame snapping for Timecode-backed FrameTimecodes, so scene boundary timecodes are PTS-accurate for VFR - Fix FlashFilter to cache the min_scene_len threshold in seconds from the first frame's framerate, avoiding incorrect thresholds when OpenCV reports an incorrect average fps - Fix FCP7 XML: use seconds*fps for frame numbers, dynamic NTSC flag - Fix OTIO: use seconds*frame_rate for RationalTime values - Add $START_PTS and $END_PTS (ms) to split-video filename templates - Refactor test_vfr.py: use open_video(), add EXPECTED_SCENES_VFR ground truth, parameterize the scene detection test for both pyav and opencv backends
1 parent 38f59c8 commit 590df64

6 files changed

Lines changed: 98 additions & 69 deletions

File tree

scenedetect/_cli/commands.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -401,18 +401,19 @@ def _save_xml_fcp(
401401
sequence = ElementTree.SubElement(project, "sequence")
402402
ElementTree.SubElement(sequence, "name").text = context.video_stream.name
403403

404+
fps = float(context.video_stream.frame_rate)
405+
ntsc = "True" if context.video_stream.frame_rate.denominator != 1 else "False"
404406
duration = scenes[-1][1] - scenes[0][0]
405-
ElementTree.SubElement(sequence, "duration").text = f"{duration.frame_num}"
407+
ElementTree.SubElement(sequence, "duration").text = str(round(duration.seconds * fps))
406408

407409
rate = ElementTree.SubElement(sequence, "rate")
408-
fps = float(context.video_stream.frame_rate)
409410
ElementTree.SubElement(rate, "timebase").text = str(round(fps))
410-
ElementTree.SubElement(rate, "ntsc").text = "False"
411+
ElementTree.SubElement(rate, "ntsc").text = ntsc
411412

412413
timecode = ElementTree.SubElement(sequence, "timecode")
413414
tc_rate = ElementTree.SubElement(timecode, "rate")
414415
ElementTree.SubElement(tc_rate, "timebase").text = str(round(fps))
415-
ElementTree.SubElement(tc_rate, "ntsc").text = "False"
416+
ElementTree.SubElement(tc_rate, "ntsc").text = ntsc
416417
ElementTree.SubElement(timecode, "frame").text = "0"
417418
ElementTree.SubElement(timecode, "displayformat").text = "NDF"
418419

@@ -430,11 +431,11 @@ def _save_xml_fcp(
430431
ElementTree.SubElement(clip, "rate").append(
431432
ElementTree.fromstring(f"<timebase>{round(fps)}</timebase>")
432433
)
433-
# TODO: Are these supposed to be frame numbers or another format?
434-
ElementTree.SubElement(clip, "start").text = str(start.frame_num)
435-
ElementTree.SubElement(clip, "end").text = str(end.frame_num)
436-
ElementTree.SubElement(clip, "in").text = str(start.frame_num)
437-
ElementTree.SubElement(clip, "out").text = str(end.frame_num)
434+
# Frame numbers relative to the declared <timebase> fps, computed from PTS seconds.
435+
ElementTree.SubElement(clip, "start").text = str(round(start.seconds * fps))
436+
ElementTree.SubElement(clip, "end").text = str(round(end.seconds * fps))
437+
ElementTree.SubElement(clip, "in").text = str(round(start.seconds * fps))
438+
ElementTree.SubElement(clip, "out").text = str(round(end.seconds * fps))
438439

439440
file_ref = ElementTree.SubElement(clip, "file", id=f"file{i + 1}")
440441
ElementTree.SubElement(file_ref, "name").text = context.video_stream.name
@@ -535,12 +536,12 @@ def save_otio(
535536
"duration": {
536537
"OTIO_SCHEMA": "RationalTime.1",
537538
"rate": frame_rate,
538-
"value": float((end - start).frame_num),
539+
"value": (end - start).seconds * frame_rate,
539540
},
540541
"start_time": {
541542
"OTIO_SCHEMA": "RationalTime.1",
542543
"rate": frame_rate,
543-
"value": float(start.frame_num),
544+
"value": start.seconds * frame_rate,
544545
},
545546
},
546547
"enabled": True,

scenedetect/backends/pyav.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ def __init__(
8181
self._name = "" if name is None else name
8282
self._path = ""
8383
self._frame: ty.Optional[av.VideoFrame] = None
84+
self._decoder: ty.Optional[ty.Generator] = None
85+
self._decode_count: int = 0
8486
self._reopened = True
8587

8688
if threading_mode:
@@ -197,14 +199,13 @@ def position_ms(self) -> float:
197199

198200
@property
199201
def frame_number(self) -> int:
200-
"""Current position within stream as the frame number.
202+
"""Current position within stream as the frame number (CFR-equivalent).
201203
202-
Will return 0 until the first frame is `read`."""
203-
204-
if self._frame:
205-
# frame_number is 1-indexed, so add 1 to the 0-based frame position.
206-
return round(self._frame.time * self.frame_rate) + 1
207-
return 0
204+
Will return 0 until the first frame is `read`. For VFR video this is an approximation
205+
derived from PTS × framerate; use `position` for accurate PTS-based timing."""
206+
if self._frame is None:
207+
return 0
208+
return round(self._frame.time * float(self.frame_rate)) + 1
208209

209210
@property
210211
def rate(self) -> Fraction:
@@ -261,6 +262,8 @@ def seek(self, target: ty.Union[FrameTimecode, float, int]) -> None:
261262
(self.base_timecode + target).seconds / self._video_stream.time_base
262263
)
263264
self._frame = None
265+
self._decoder = None
266+
self._decode_count = 0
264267
self._container.seek(target_pts, stream=self._video_stream)
265268
if not beginning:
266269
self.read(decode=False)
@@ -272,15 +275,23 @@ def reset(self):
272275
"""Close and re-open the VideoStream (should be equivalent to calling `seek(0)`)."""
273276
self._container.close()
274277
self._frame = None
278+
self._decoder = None
279+
self._decode_count = 0
275280
try:
276281
self._container = av.open(self._path if self._path else self._io)
277282
except Exception as ex:
278283
raise VideoOpenFailure() from ex
279284

280285
def read(self, decode: bool = True) -> ty.Union[np.ndarray, bool]:
286+
# Reuse a persistent decoder generator so the codec's internal frame buffer (used for
287+
# B-frame reordering) is never flushed prematurely. Creating a new generator each call
288+
# caused the last buffered frame to be lost at EOF.
289+
if self._decoder is None:
290+
self._decoder = self._container.decode(video=0)
281291
try:
282292
last_frame = self._frame
283-
self._frame = next(self._container.decode(video=0))
293+
self._frame = next(self._decoder)
294+
self._decode_count += 1
284295
except av.error.EOFError:
285296
self._frame = last_frame
286297
if self._handle_eof():
@@ -345,7 +356,7 @@ def _handle_eof(self):
345356
# Don't re-open the video if we can't seek or aren't in AUTO/FRAME thread_type mode.
346357
if not self.is_seekable or self._video_stream.thread_type not in ("AUTO", "FRAME"):
347358
return False
348-
last_frame = self.frame_number
359+
last_pos_secs = self.position.seconds
349360
orig_pos = self._io.tell()
350361
try:
351362
self._io.seek(0)
@@ -355,5 +366,6 @@ def _handle_eof(self):
355366
raise
356367
self._container.close()
357368
self._container = container
358-
self.seek(last_frame)
369+
self._decoder = None
370+
self.seek(last_pos_secs)
359371
return True

scenedetect/common.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,10 @@ def get_timecode(
375375
str: The current time in the form ``"HH:MM:SS[.nnn]"``.
376376
"""
377377
# Compute hours and minutes based off of seconds, and update seconds.
378-
if nearest_frame and self.framerate:
378+
# For PTS-backed timecodes, the PTS already represents an exact frame boundary, so we use
379+
# `seconds` directly. For non-PTS timecodes, `nearest_frame` snaps to the nearest frame
380+
# boundary using frame_num, which avoids floating point drift in CFR video display.
381+
if nearest_frame and self.framerate and not isinstance(self._time, Timecode):
379382
secs = self.frame_num / self.framerate
380383
else:
381384
secs = self.seconds

scenedetect/detector.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def __init__(self, mode: Mode, length: int):
122122
"""
123123
self._mode = mode
124124
self._filter_length = length # Number of frames to use for activating the filter.
125+
self._filter_secs: ty.Optional[float] = None # Threshold in seconds, computed on first use.
125126
self._last_above = None # Last frame above threshold.
126127
self._merge_enabled = False # Used to disable merging until at least one cut was found.
127128
self._merge_triggered = False # True when the merge filter is active.
@@ -143,9 +144,12 @@ def filter(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[Fram
143144
raise RuntimeError("Unhandled FlashFilter mode.")
144145

145146
def _filter_suppress(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[int]:
146-
framerate = timecode.framerate
147-
assert framerate >= 0
148-
min_length_met: bool = (timecode - self._last_above) >= (self._filter_length / framerate)
147+
assert timecode.framerate >= 0
148+
# Compute the threshold in seconds once from the first frame's framerate. This avoids
149+
# using an incorrect average fps (e.g. OpenCV on VFR video) on subsequent frames.
150+
if self._filter_secs is None:
151+
self._filter_secs = self._filter_length / timecode.framerate
152+
min_length_met: bool = (timecode - self._last_above) >= self._filter_secs
149153
if not (above_threshold and min_length_met):
150154
return []
151155
# Both length and threshold requirements were satisfied. Emit the cut, and wait until both
@@ -154,16 +158,17 @@ def _filter_suppress(self, timecode: FrameTimecode, above_threshold: bool) -> ty
154158
return [timecode]
155159

156160
def _filter_merge(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[int]:
157-
framerate = timecode.framerate
158-
assert framerate >= 0
159-
min_length_met: bool = (timecode - self._last_above) >= (self._filter_length / framerate)
161+
assert timecode.framerate >= 0
162+
# Compute the threshold in seconds once from the first frame's framerate.
163+
if self._filter_secs is None:
164+
self._filter_secs = self._filter_length / timecode.framerate
165+
min_length_met: bool = (timecode - self._last_above) >= self._filter_secs
160166
# Ensure last frame is always advanced to the most recent one that was above the threshold.
161167
if above_threshold:
162168
self._last_above = timecode
163169
if self._merge_triggered:
164170
# This frame was under the threshold, see if enough frames passed to disable the filter.
165-
num_merged_frames = self._last_above - self._merge_start
166-
if min_length_met and not above_threshold and num_merged_frames >= self._filter_length:
171+
if min_length_met and not above_threshold and (self._last_above - self._merge_start) >= self._filter_secs:
167172
self._merge_triggered = False
168173
return [self._last_above]
169174
# Keep merging until enough frames pass below the threshold.

scenedetect/output/video.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ class SceneMetadata:
125125
def default_formatter(template: str) -> PathFormatter:
126126
"""Formats filenames using a template string which allows the following variables:
127127
128-
`$VIDEO_NAME`, `$SCENE_NUMBER`, `$START_TIME`, `$END_TIME`, `$START_FRAME`, `$END_FRAME`
128+
`$VIDEO_NAME`, `$SCENE_NUMBER`, `$START_TIME`, `$END_TIME`, `$START_FRAME`, `$END_FRAME`,
129+
`$START_PTS`, `$END_PTS` (presentation timestamp in milliseconds, accurate for VFR video)
129130
"""
130131
MIN_DIGITS = 3
131132
format_scene_number: PathFormatter = lambda video, scene: (
@@ -139,6 +140,8 @@ def default_formatter(template: str) -> PathFormatter:
139140
END_TIME=str(scene.end.get_timecode().replace(":", ";")),
140141
START_FRAME=str(scene.start.frame_num),
141142
END_FRAME=str(scene.end.frame_num),
143+
START_PTS=str(round(scene.start.seconds * 1000)),
144+
END_PTS=str(round(scene.end.seconds * 1000)),
142145
)
143146
return formatter
144147

tests/test_vfr.py

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,38 +17,33 @@
1717

1818
import pytest
1919

20-
from scenedetect import SceneManager
20+
from scenedetect import SceneManager, open_video
2121
from scenedetect.common import FrameTimecode, Timecode
2222
from scenedetect.detectors import ContentDetector
2323
from scenedetect.stats_manager import StatsManager
2424

25-
26-
def _open_pyav(path: str):
27-
"""Open a video with the PyAV backend."""
28-
from scenedetect.backends.pyav import VideoStreamAv
29-
30-
return VideoStreamAv(path)
31-
32-
33-
def _open_opencv(path: str):
34-
"""Open a video with the OpenCV backend."""
35-
from scenedetect.backends.opencv import VideoStreamCv2
36-
37-
return VideoStreamCv2(path)
25+
# Expected scene cuts for `goldeneye-vfr.mp4` detected with ContentDetector() and end_time=10.0s.
26+
# Entries are (start_timecode, end_timecode). All backends should agree on cut timecodes since
27+
# CAP_PROP_POS_MSEC gives accurate PTS-derived timestamps. The last scene ends at the clip
28+
# boundary (end_time) which may vary slightly between backends based on frame counting.
29+
EXPECTED_SCENES_VFR: ty.List[ty.Tuple[str, str]] = [
30+
("00:00:00.000", "00:00:03.921"),
31+
("00:00:03.921", "00:00:09.676"),
32+
]
3833

3934

4035
class TestVFR:
4136
"""Test VFR video handling."""
4237

4338
def test_vfr_position_is_timecode(self, test_vfr_video: str):
4439
"""Position should be a Timecode-backed FrameTimecode."""
45-
video = _open_pyav(test_vfr_video)
40+
video = open_video(test_vfr_video, backend="pyav")
4641
assert video.read() is not False
4742
assert isinstance(video.position._time, Timecode)
4843

4944
def test_vfr_position_monotonic_pyav(self, test_vfr_video: str):
50-
"""PTS-based position should be monotonically non-decreasing."""
51-
video = _open_pyav(test_vfr_video)
45+
"""PTS-based position should be monotonically non-decreasing (PyAV)."""
46+
video = open_video(test_vfr_video, backend="pyav")
5247
last_seconds = -1.0
5348
frame_count = 0
5449
while True:
@@ -64,8 +59,8 @@ def test_vfr_position_monotonic_pyav(self, test_vfr_video: str):
6459
assert frame_count > 0
6560

6661
def test_vfr_position_monotonic_opencv(self, test_vfr_video: str):
67-
"""PTS-based position should be monotonically non-decreasing with OpenCV."""
68-
video = _open_opencv(test_vfr_video)
62+
"""PTS-based position should be monotonically non-decreasing (OpenCV)."""
63+
video = open_video(test_vfr_video, backend="opencv")
6964
last_seconds = -1.0
7065
frame_count = 0
7166
while True:
@@ -80,23 +75,36 @@ def test_vfr_position_monotonic_opencv(self, test_vfr_video: str):
8075
frame_count += 1
8176
assert frame_count > 0
8277

83-
def test_vfr_scene_detection(self, test_vfr_video: str):
84-
"""Scene detection should work on VFR video and produce reasonable timestamps."""
85-
video = _open_pyav(test_vfr_video)
78+
@pytest.mark.parametrize("backend", ["pyav", "opencv"])
79+
def test_vfr_scene_detection(self, test_vfr_video: str, backend: str):
80+
"""Scene detection on VFR video should produce timestamps matching known ground truth.
81+
82+
Both PyAV (native PTS) and OpenCV (CAP_PROP_POS_MSEC) should agree on scene cuts since
83+
both expose accurate PTS-derived timestamps.
84+
"""
85+
video = open_video(test_vfr_video, backend=backend)
8686
sm = SceneManager()
8787
sm.add_detector(ContentDetector())
88-
sm.detect_scenes(video=video)
88+
sm.detect_scenes(video=video, end_time=10.0)
8989
scene_list = sm.get_scene_list()
90-
# Should detect at least one scene.
91-
assert len(scene_list) > 0
92-
# All timestamps should be non-negative and within video duration.
93-
for start, end in scene_list:
94-
assert start.seconds >= 0
95-
assert end.seconds > start.seconds
90+
91+
# The last scene ends at the clip boundary which may vary by backend; only check known cuts.
92+
assert len(scene_list) >= len(EXPECTED_SCENES_VFR), (
93+
f"[{backend}] Expected at least {len(EXPECTED_SCENES_VFR)} scenes, got {len(scene_list)}"
94+
)
95+
for i, ((start, end), (exp_start_tc, exp_end_tc)) in enumerate(
96+
zip(scene_list, EXPECTED_SCENES_VFR, strict=False)
97+
):
98+
assert start.get_timecode() == exp_start_tc, (
99+
f"[{backend}] Scene {i + 1} start: expected {exp_start_tc!r}, got {start.get_timecode()!r}"
100+
)
101+
assert end.get_timecode() == exp_end_tc, (
102+
f"[{backend}] Scene {i + 1} end: expected {exp_end_tc!r}, got {end.get_timecode()!r}"
103+
)
96104

97105
def test_vfr_seek_pyav(self, test_vfr_video: str):
98106
"""Seeking should work with VFR video."""
99-
video = _open_pyav(test_vfr_video)
107+
video = open_video(test_vfr_video, backend="pyav")
100108
target_time = 2.0 # seconds
101109
video.seek(target_time)
102110
frame = video.read()
@@ -106,20 +114,18 @@ def test_vfr_seek_pyav(self, test_vfr_video: str):
106114

107115
def test_vfr_stats_manager(self, test_vfr_video: str):
108116
"""StatsManager should work correctly with VFR video."""
109-
video = _open_pyav(test_vfr_video)
117+
video = open_video(test_vfr_video, backend="pyav")
110118
stats = StatsManager()
111119
sm = SceneManager(stats_manager=stats)
112120
sm.add_detector(ContentDetector())
113121
sm.detect_scenes(video=video)
114-
# Stats should have metrics for frames.
115-
scene_list = sm.get_scene_list()
116-
assert len(scene_list) > 0
122+
assert len(sm.get_scene_list()) > 0
117123

118124
def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
119125
"""CSV export should work correctly with VFR video."""
120126
from scenedetect.output import write_scene_list
121127

122-
video = _open_pyav(test_vfr_video)
128+
video = open_video(test_vfr_video, backend="pyav")
123129
sm = SceneManager()
124130
sm.add_detector(ContentDetector())
125131
sm.detect_scenes(video=video)
@@ -134,18 +140,17 @@ def test_vfr_csv_output(self, test_vfr_video: str, tmp_path):
134140
with open(csv_path, "r") as f:
135141
reader = csv.reader(f)
136142
rows = list(reader)
137-
# Header + at least one data row.
138143
assert len(rows) >= 3 # 2 header rows + data
139144

140145
def test_cfr_position_is_timecode(self, test_movie_clip: str):
141146
"""CFR video positions should also be Timecode-backed with PTS support."""
142-
video = _open_pyav(test_movie_clip)
147+
video = open_video(test_movie_clip, backend="pyav")
143148
assert video.read() is not False
144149
assert isinstance(video.position._time, Timecode)
145150

146151
def test_cfr_frame_num_exact(self, test_movie_clip: str):
147152
"""For CFR video, frame_num should be exact (not approximate)."""
148-
video = _open_pyav(test_movie_clip)
153+
video = open_video(test_movie_clip, backend="pyav")
149154
for expected_frame in range(1, 11):
150155
assert video.read() is not False
151156
assert video.position.frame_num == expected_frame - 1

0 commit comments

Comments
 (0)