Remove Commercials with ffmpeg and PySceneDetect
Managing non-contiguous sections
Created: 2022-01-11
Updated: 2022-08-19
Background
I wanted to remove commercials from several hundred TV episodes spread across a handful of long recordings. Additionally, I wanted to split the recordings into one file per episode for a media library.
Scene Detection
Detecting the transitions from content to commercial is made easy by PySceneDetect.
The default settings were nearly perfect at finding these transitions.
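The CLI is used throughout this post, but PySceneDetect also exposes a Python API that is handy for spot-checking detection. A minimal sketch, assuming PySceneDetect v0.6+ and a hypothetical sample.mp4:
from scenedetect import detect
from scenedetect.detectors import ThresholdDetector

# Threshold-based detection, mirroring the CLI's detect-threshold command
scene_list = detect("sample.mp4", ThresholdDetector())
for start, end in scene_list:
    print(f"Scene: {start.get_seconds():.3f}s -> {end.get_seconds():.3f}s")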
Usage
PySceneDetect is easily called from the command line. However, since I was working in batches and wanted to combine scene review and episode annotation with the conversion step, I decided to use a Jupyter Notebook.
from subprocess import Popen

def get_scenes(folder, video):
    """Run PySceneDetect on a video, writing a scene report, thumbnails, and an HTML summary."""
    args = ["scenedetect", "-i", video, "detect-threshold", "list-scenes",
            "save-images", "export-html", "-w", "320", "-h", "180"]
    with Popen(args, cwd=folder) as p:
        p.communicate()
As you can see from the above function, PySceneDetect generates a report of the detected scenes, along with thumbnails to confirm their accuracy.
With the report finished, I use some helper functions to work with the report and integrate it into Jupyter.
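Those helpers aren't reproduced here, but one can be sketched: a hypothetical show_scene_report that embeds PySceneDetect's HTML report in the notebook (assuming the report is reachable from the notebook's working directory):
from IPython.display import IFrame, display

def show_scene_report(report_path: str, width: int = 960, height: int = 540):
    # Embed the export-html report (scene table + thumbnails) in the notebook
    display(IFrame(report_path, width=width, height=height))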
Annotating Scenes
To assign scenes to a video output, I came up with a simple syntax:
scene_stmt: number
scene_range: number-number (inclusive)
scene_break: number,number
output_name: ...string
Example:
- Select scenes 1-4 (inclusive), scene 7, and scenes 8-11 (inclusive) and save to a video named 'output'
annotation = "1-4,7,8-11...output"
Annotation Parser
def parse_scene_annotation(s: str) -> tuple[list[tuple[int, int]], str]:
    """
    Transform an annotation string into structured form:
    a list of inclusive (start, end) scene ranges plus the output name.
    """
    scenes_str, output_name = s.split("...")
    scenes = []
    for sc_range in scenes_str.split(","):  # commas separate disjoint scenes
        sc_range = sc_range.strip()
        if "-" not in sc_range:
            # A single scene is treated as a one-scene range
            scenes.append((int(sc_range), int(sc_range)))
            continue
        start, _, end = sc_range.partition("-")
        scenes.append((int(start.strip()), int(end.strip())))
    return scenes, output_name
Translate Scene Numbers to Timestamps
We will use the report from PySceneDetect to translate scene numbers to timestamps.
import pandas as pd
from pathlib import Path

scene_report_fp = Path("~/Videos/Captures/Scenes.csv").expanduser()  # scene report from PySceneDetect
df = pd.read_csv(scene_report_fp, skiprows=1, index_col=["Scene Number"])

def get_scene_timestamp(scene_number: int, is_end: bool, df_scenes: pd.DataFrame) -> float:
    """Return the start or end timestamp (in seconds) of a scene, depending on whether it starts or ends a range."""
    return df_scenes.at[scene_number, f"{'End' if is_end else 'Start'} Time (seconds)"]
def get_scene_timestamps(scene_ranges, df_scenes: pd.DataFrame):
    """Convert inclusive (start, end) scene-number ranges to (start, end) timestamps in seconds."""
    scene_times = []
    for start, end in scene_ranges:
        start_secs = get_scene_timestamp(start, is_end=False, df_scenes=df_scenes)
        end_secs = get_scene_timestamp(end, is_end=True, df_scenes=df_scenes)
        scene_times.append((start_secs, end_secs))
    return scene_times
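Hypothetical usage, given the df loaded above:
# Seconds for the inclusive range 1-3 and the single scene 7
get_scene_timestamps([(1, 3), (7, 7)], df_scenes=df)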
Build Filtergraph
We convert scene numbers to timestamps and prepare them for ingestion by ffmpeg, with some ffmpeg-specific logic.
from itertools import chain
def parse_scenes(scene_ranges: list[tuple[int, int]], df_scenes: pd.DataFrame,
                 fast_seek_buffer: int = 100) -> tuple[list[tuple[str, str]], str]:
    """
    Fast seeking is not exact, so we want to start transcoding before the exact timestamp.
    This requires recalculating our timestamps to adjust for the buffer.
    Note that ffmpeg will discard the buffered portion; it will *not* be included in the output.
    We also want to convert our timestamps to strings with .4f precision.
    """
    ends = get_scene_timestamps(scene_ranges, df_scenes=df_scenes)
    # Seek point: a buffer ahead of the earliest scene, clamped at 0
    ss_start = min(chain.from_iterable(ends)) - fast_seek_buffer
    ss_start = max(ss_start, 0)
    new_pieces: list[tuple[str, str]] = []
    for start, end in ends:
        # Re-express each timestamp relative to the seek point
        new_start = start - ss_start
        new_end = end - ss_start
        new_pieces.append(("%.4f" % new_start, "%.4f" % new_end))
    ss_start_code = "%.4f" % ss_start
    return new_pieces, ss_start_code
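The returned ss_start_code only matters if it reaches ffmpeg as an input option: placing -ss before -i seeks quickly (but inexactly, hence the buffer), and every trim timestamp above is relative to that seek point. parse_command below wires this up; as a sketch of the argument order, with hypothetical values:
# '-ss' placed before '-i' enables fast input seeking; the filtergraph then trims precisely
ss_start_code = "291.7830"  # hypothetical seek point
args = ["ffmpeg", "-ss", ss_start_code, "-i", "src.mp4", "-filter_complex", "..."]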
Hooking in ffmpeg
Fortunately, we can build the filtergraph programmatically.
def make_filtergraph_pieces(scene_range: tuple[str, str], chunk: int):
    """
    Trim the video and audio streams to the scene range and reset their timestamps (setpts/asetpts).
    """
    v = f"[0:v]trim=start={scene_range[0]}:end={scene_range[1]},setpts=PTS-STARTPTS[{chunk}v];"
    a = f"[0:a]atrim=start={scene_range[0]}:end={scene_range[1]},asetpts=PTS-STARTPTS[{chunk}a];"
    return v + a
def make_concat_filtergraph(n_chunks: int):
    """
    Rejoin the trim/atrim chunks from above with the concat filter.
    """
    pre = "".join(f"[{i}v][{i}a]" for i in range(n_chunks))
    return f"{pre}concat=n={n_chunks}:v=1:a=1[outv][outa]"
def make_filtergraph(scene_ranges: list[tuple[str, str]]):
    """Generate the full filtergraph string."""
    inputs = "".join(make_filtergraph_pieces(scene_range=e, chunk=i) for i, e in enumerate(scene_ranges))
    return inputs + make_concat_filtergraph(len(scene_ranges))
Example Outputs
Parsing Annotation
command = "1-3,7-23,25...myvid"
parsed = parse_scene_annotation(command)
parsed
>>> ([(1, 3), (7, 23), (25, 25)], 'myvid')
Getting Timestamps
ts, ss = parse_scenes(parsed[0], df)
ts, ss
>>> ([('0.0000', '8.9170'), ('391.7830', '406.8830'), ('1001.5670', '1005.7000')], '0.0000')
Getting Filtergraph
fg = make_filtergraph(ts)
fg
>>> '[0:v]trim=start=0.0000:end=8.9170,setpts=PTS-STARTPTS[0v];[0:a]atrim=start=0.0000:end=8.9170,asetpts=PTS-STARTPTS[0a];[0:v]trim=start=391.7830:end=406.8830,setpts=PTS-STARTPTS[1v];[0:a]atrim=start=391.7830:end=406.8830,asetpts=PTS-STARTPTS[1a];[0:v]trim=start=1001.5670:end=1005.7000,setpts=PTS-STARTPTS[2v];[0:a]atrim=start=1001.5670:end=1005.7000,asetpts=PTS-STARTPTS[2a];[0v][0a][1v][1a][2v][2a]concat=n=3:v=1:a=1[outv][outa]'
Chaining It Together
def parse_command(command: str, src_video: str, video_format: str, output_folder: str, df_scenes: pd.DataFrame):
    scene_ranges, name = parse_scene_annotation(command)
    scene_ts, ss = parse_scenes(scene_ranges, df_scenes)
    fg = make_filtergraph(scene_ts)
    output_file = str((Path(output_folder).expanduser() / name).with_suffix(video_format))
    # '-ss' before '-i' fast-seeks to just ahead of the first scene; the filtergraph trims precisely
    args = ["ffmpeg", "-ss", ss, "-i", src_video, "-filter_complex", fg,
            "-map", "[outv]", "-map", "[outa]", "-c:v", "h264",
            "-preset", "slow", "-movflags", "+faststart", "-fps_mode", "passthrough",
            output_file]
    return args
Ready
from subprocess import Popen, STDOUT, PIPE
from tqdm.auto import tqdm

commands = [
    "1-3,7...myvid",
    "13,16-22...my_over_vid",
]

for command in tqdm(commands):
    parsed_command = parse_command(command, src_video="/path/to/src.mp4",
                                   video_format=".mp4",
                                   output_folder="/path/to/converted",
                                   df_scenes=df)
    with Popen(parsed_command) as p:
        p.communicate()
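ffmpeg logs heavily to stderr, which fights with tqdm's progress bar. A minimal sketch of a quieter runner using the PIPE and STDOUT imports above (run_quiet is my own naming, not one of the original helpers):
def run_quiet(args: list[str]) -> int:
    # Capture ffmpeg's log; surface it only if the command fails
    with Popen(args, stdout=PIPE, stderr=STDOUT) as p:
        out, _ = p.communicate()
    if p.returncode != 0:
        print(out.decode(errors="replace"))
    return p.returncode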