Remove Commercials with ffmpeg and PySceneDetect
Managing non-contiguous sections
Created: 2022-01-11
Updated: 2022-08-19
Background
I wanted to remove commercials from several hundred TV episodes spread across a handful of long recordings. Additionally, I wanted to split the recordings into one file per episode for a media library.
Scene Detection
Detecting the transitions from content to commercial is made easy by PySceneDetect.
The default settings were nearly perfect at finding these transitions.
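The CLI is used throughout this post, but PySceneDetect also exposes a Python API that is handy for spot-checking detection. A minimal sketch, assuming PySceneDetect v0.6+ and a hypothetical sample.mp4:
from scenedetect import detect
from scenedetect.detectors import ThresholdDetector

# Threshold-based detection, mirroring the CLI's detect-threshold command
scene_list = detect("sample.mp4", ThresholdDetector())
for start, end in scene_list:
    print(f"Scene: {start.get_seconds():.3f}s -> {end.get_seconds():.3f}s")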
Usage
PySceneDetect is easily called from the command line. However, since I was working in batches and wanted to combine scene review and episode annotation with the conversion step, I decided to use a Jupyter Notebook.
from subprocess import Popen

def get_scenes(folder, video):
    """Run PySceneDetect on a video, writing a scene report, thumbnails, and an HTML summary."""
    args = ["scenedetect", "-i", video, "detect-threshold", "list-scenes",
            "save-images", "export-html", "-w", "320", "-h", "180"]
    with Popen(args, cwd=folder) as p:
        p.communicate()
As you can see from the above function, PySceneDetect generates a report of the detected scenes, along with thumbnails to confirm their accuracy.
With the report finished, I use some helper functions to work with the report and integrate it into Jupyter.
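Those helpers aren't reproduced here, but one can be sketched: a hypothetical show_scene_report that embeds PySceneDetect's HTML report in the notebook (assuming the report is reachable from the notebook's working directory):
from IPython.display import IFrame, display

def show_scene_report(report_path: str, width: int = 960, height: int = 540):
    # Embed the export-html report (scene table + thumbnails) in the notebook
    display(IFrame(report_path, width=width, height=height))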
Annotating Scenes
To assign scenes to a video output, I came up with a simple syntax:
scene_stmt: number
scene_range: number-number (inclusive)
scene_break: number,number
output_name: ...string
Example:
- Select scenes 1-4 (inclusive), scene 7, and scenes 8-11 (inclusive) and save to a video named 'output'
annotation = "1-4,7,8-11...output"
Annotation Parser
def parse_scene_annotation(s: str) -> tuple[list[tuple[int, int]], str]:
    """
    Transform an annotation string into structured form:
    a list of inclusive (start, end) scene ranges plus the output name.
    """
    scenes_str, output_name = s.split("...")
    scenes = []
    for sc_range in scenes_str.split(","):  # commas separate disjoint scenes
        sc_range = sc_range.strip()
        if "-" not in sc_range:
            # A single scene is treated as a one-scene range
            scenes.append((int(sc_range), int(sc_range)))
            continue
        start, _, end = sc_range.partition("-")
        scenes.append((int(start.strip()), int(end.strip())))
    return scenes, output_name
Translate Scene Numbers to Timestamps
We will use the report from PySceneDetect to translate scene numbers to timestamps.
import pandas as pd
from pathlib import Path

scene_report_fp = Path("~/Videos/Captures/Scenes.csv").expanduser()  # scene report from PySceneDetect
df = pd.read_csv(scene_report_fp, skiprows=1, index_col=["Scene Number"])

def get_scene_timestamp(scene_number: int, is_end: bool, df_scenes: pd.DataFrame) -> float:
    """Return the start or end timestamp (in seconds) of a scene, depending on whether it starts or ends a range."""
    return df_scenes.at[scene_number, f"{'End' if is_end else 'Start'} Time (seconds)"]
def get_scene_timestamps(scene_ranges, df_scenes: pd.DataFrame):
    """Convert inclusive (start, end) scene-number ranges to (start, end) timestamps in seconds."""
    scene_times = []
    for start, end in scene_ranges:
        start_secs = get_scene_timestamp(start, is_end=False, df_scenes=df_scenes)
        end_secs = get_scene_timestamp(end, is_end=True, df_scenes=df_scenes)
        scene_times.append((start_secs, end_secs))
    return scene_times
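Hypothetical usage, given the df loaded above:
# Seconds for the inclusive range 1-3 and the single scene 7
get_scene_timestamps([(1, 3), (7, 7)], df_scenes=df)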
Build Filtergraph
We convert scene numbers to timestamps and prepare them for ingestion by ffmpeg, with some ffmpeg-specific logic.
from itertools import chain
def parse_scenes(scene_ranges: list[tuple[int, int]], df_scenes: pd.DataFrame,
                 fast_seek_buffer: int = 100) -> tuple[list[tuple[str, str]], str]:
    """
    Fast seeking is not exact, so we want to start transcoding before the exact timestamp.
    This requires recalculating our timestamps to adjust for the buffer.
    Note that ffmpeg will discard the buffered portion; it will *not* be included in the output.
    We also want to convert our timestamps to strings with .4f precision.
    """
    ends = get_scene_timestamps(scene_ranges, df_scenes=df_scenes)
    # Seek point: a buffer ahead of the earliest scene, clamped at 0
    ss_start = min(chain.from_iterable(ends)) - fast_seek_buffer
    ss_start = max(ss_start, 0)
    new_pieces: list[tuple[str, str]] = []
    for start, end in ends:
        # Re-express each timestamp relative to the seek point
        new_start = start - ss_start
        new_end = end - ss_start
        new_pieces.append(("%.4f" % new_start, "%.4f" % new_end))
    ss_start_code = "%.4f" % ss_start
    return new_pieces, ss_start_code
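The returned ss_start_code only matters if it reaches ffmpeg as an input option: placing -ss before -i seeks quickly (but inexactly, hence the buffer), and every trim timestamp above is relative to that seek point. parse_command below wires this up; as a sketch of the argument order, with hypothetical values:
# '-ss' placed before '-i' enables fast input seeking; the filtergraph then trims precisely
ss_start_code = "291.7830"  # hypothetical seek point
args = ["ffmpeg", "-ss", ss_start_code, "-i", "src.mp4", "-filter_complex", "..."]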
Hooking in ffmpeg
Fortunately, we can build the filtergraph programmatically.
def make_filtergraph_pieces(scene_range: tuple[str, str], chunk: int):
    """
    Trim the video and audio streams to the scene range and reset their timestamps (setpts/asetpts).
    """
    v = f"[0:v]trim=start={scene_range[0]}:end={scene_range[1]},setpts=PTS-STARTPTS[{chunk}v];"
    a = f"[0:a]atrim=start={scene_range[0]}:end={scene_range[1]},asetpts=PTS-STARTPTS[{chunk}a];"
    return v + a
def make_concat_filtergraph(n_chunks: int):
    """
    Rejoin the trim/atrim chunks from above with the concat filter.
    """
    pre = "".join(f"[{i}v][{i}a]" for i in range(n_chunks))
    return f"{pre}concat=n={n_chunks}:v=1:a=1[outv][outa]"
def make_filtergraph(scene_ranges: list[tuple[str, str]]):
    """Generate the full filtergraph string."""
    inputs = "".join(make_filtergraph_pieces(scene_range=e, chunk=i) for i, e in enumerate(scene_ranges))
    return inputs + make_concat_filtergraph(len(scene_ranges))
Example Outputs
Parsing Annotation
command = "1-3,7-23,25...myvid"
parsed = parse_scene_annotation(command)
parsed
>>> ([(1, 3), (7, 23), (25, 25)], 'myvid')
Getting Timestamps
ts, ss = parse_scenes(parsed[0], df)
ts, ss
>>> ([('0.0000', '8.9170'), ('391.7830', '406.8830'), ('1001.5670', '1005.7000')], '0.0000')
Getting Filtergraph
fg = make_filtergraph(ts)
fg
>>> '[0:v]trim=start=0.0000:end=8.9170,setpts=PTS-STARTPTS[0v];[0:a]atrim=start=0.0000:end=8.9170,asetpts=PTS-STARTPTS[0a];[0:v]trim=start=391.7830:end=406.8830,setpts=PTS-STARTPTS[1v];[0:a]atrim=start=391.7830:end=406.8830,asetpts=PTS-STARTPTS[1a];[0:v]trim=start=1001.5670:end=1005.7000,setpts=PTS-STARTPTS[2v];[0:a]atrim=start=1001.5670:end=1005.7000,asetpts=PTS-STARTPTS[2a];[0v][0a][1v][1a][2v][2a]concat=n=3:v=1:a=1[outv][outa]'
Chaining It Together
def parse_command(command: str, src_video: str, video_format: str, output_folder: str, df_scenes: pd.DataFrame):
    scene_ranges, name = parse_scene_annotation(command)
    scene_ts, ss = parse_scenes(scene_ranges, df_scenes)
    fg = make_filtergraph(scene_ts)
    output_file = str((Path(output_folder).expanduser() / name).with_suffix(video_format))
    # '-ss' before '-i' fast-seeks to just ahead of the first scene; the filtergraph trims precisely
    args = ["ffmpeg", "-ss", ss, "-i", src_video, "-filter_complex", fg,
            "-map", "[outv]", "-map", "[outa]", "-c:v", "h264",
            "-preset", "slow", "-movflags", "+faststart", "-fps_mode", "passthrough",
            output_file]
    return args
Ready
from subprocess import Popen, STDOUT, PIPE
from tqdm.auto import tqdm

commands = [
    "1-3,7...myvid",
    "13,16-22...my_over_vid",
]

for command in tqdm(commands):
    parsed_command = parse_command(command, src_video="/path/to/src.mp4",
                                   video_format=".mp4",
                                   output_folder="/path/to/converted",
                                   df_scenes=df)
    with Popen(parsed_command) as p:
        p.communicate()
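ffmpeg logs heavily to stderr, which fights with tqdm's progress bar. A minimal sketch of a quieter runner using the PIPE and STDOUT imports above (run_quiet is my own naming, not one of the original helpers):
def run_quiet(args: list[str]) -> int:
    # Capture ffmpeg's log; surface it only if the command fails
    with Popen(args, stdout=PIPE, stderr=STDOUT) as p:
        out, _ = p.communicate()
    if p.returncode != 0:
        print(out.decode(errors="replace"))
    return p.returncode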