# Source code for torchcodec._core._metadata

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import dataclasses
import json
import pathlib
from dataclasses import dataclass
from typing import List, Optional, Union

import torch

from torchcodec._core.ops import (
    _get_container_json_metadata,
    _get_stream_json_metadata,
    create_from_file,
)


# Indentation prefix placed before each "name: value" line in the metadata
# classes' __repr__ output.
SPACES = "  "


@dataclass
class StreamMetadata:
    """Metadata fields shared by every kind of stream (video, audio, other)."""

    duration_seconds_from_header: Optional[float]
    """Duration of the stream, in seconds, obtained from the header (float or
    None). This could be inaccurate."""
    begin_stream_seconds_from_header: Optional[float]
    """Beginning of the stream, in seconds, obtained from the header (float or
    None). Usually, this is equal to 0."""
    bit_rate: Optional[float]
    """Bit rate of the stream (float or None). Presumably expressed in bits
    per second, following FFmpeg's convention."""
    codec: Optional[str]
    """Codec (str or None)."""
    stream_index: int
    """Index of the stream that this metadata refers to (int)."""

    def __repr__(self):
        """Return a multi-line listing of the metadata.

        The first line is the class name followed by a colon; each following
        line is an indented ``name: value`` pair, one per dataclass field.
        """
        # dataclasses.fields() is called on the instance, so fields declared
        # by subclasses are listed as well.
        lines = [f"{self.__class__.__name__}:\n"]
        lines.extend(
            f"{SPACES}{field.name}: {getattr(self, field.name)}\n"
            for field in dataclasses.fields(self)
        )
        return "".join(lines)


@dataclass
class VideoStreamMetadata(StreamMetadata):
    """Metadata of a single video stream."""

    begin_stream_seconds_from_content: Optional[float]
    """Beginning of the stream, in seconds (float or None).
    Conceptually, this corresponds to the first frame's :term:`pts`. It is only
    computed when a :term:`scan` is done as min(frame.pts) across all frames in
    the stream. Usually, this is equal to 0."""
    end_stream_seconds_from_content: Optional[float]
    """End of the stream, in seconds (float or None).
    Conceptually, this corresponds to last_frame.pts + last_frame.duration. It
    is only computed when a :term:`scan` is done as max(frame.pts +
    frame.duration) across all frames in the stream. Note that no frame is
    played at this time value, so calling
    :meth:`~torchcodec.decoders.VideoDecoder.get_frame_played_at` with this
    value would result in an error. Retrieving the last frame is best done by
    simply indexing the :class:`~torchcodec.decoders.VideoDecoder` object with
    ``[-1]``.
    """
    width: Optional[int]
    """Width of the frames (int or None)."""
    height: Optional[int]
    """Height of the frames (int or None)."""
    num_frames_from_header: Optional[int]
    """Number of frames, from the stream's metadata. This is potentially
    inaccurate. We recommend using the ``num_frames`` attribute instead.
    (int or None)."""
    num_frames_from_content: Optional[int]
    """Number of frames computed by TorchCodec by scanning the stream's
    content (the scan doesn't involve decoding). This is more accurate
    than ``num_frames_from_header``. We recommend using the
    ``num_frames`` attribute instead. (int or None)."""
    average_fps_from_header: Optional[float]
    """Average fps of the stream, obtained from the header (float or None).
    We recommend using the ``average_fps`` attribute instead."""

    @property
    def duration_seconds(self) -> Optional[float]:
        """Duration of the stream in seconds. We try to calculate the duration
        from the actual frames if a :term:`scan` was performed. Otherwise we
        fall back to ``duration_seconds_from_header``.
        """
        if (
            self.end_stream_seconds_from_content is None
            or self.begin_stream_seconds_from_content is None
        ):
            return self.duration_seconds_from_header
        return (
            self.end_stream_seconds_from_content
            - self.begin_stream_seconds_from_content
        )

    @property
    def begin_stream_seconds(self) -> float:
        """Beginning of the stream, in seconds (float). Conceptually, this
        corresponds to the first frame's :term:`pts`. If
        ``begin_stream_seconds_from_content`` is not None, then it is returned.
        Otherwise, this value is 0.
        """
        if self.begin_stream_seconds_from_content is None:
            return 0
        else:
            return self.begin_stream_seconds_from_content

    @property
    def end_stream_seconds(self) -> Optional[float]:
        """End of the stream, in seconds (float or None).
        Conceptually, this corresponds to last_frame.pts + last_frame.duration.
        If ``end_stream_seconds_from_content`` is not None, then that value is
        returned. Otherwise, returns ``duration_seconds``.
        """
        if self.end_stream_seconds_from_content is None:
            return self.duration_seconds
        else:
            return self.end_stream_seconds_from_content

    @property
    def num_frames(self) -> Optional[int]:
        """Number of frames in the stream. This corresponds to
        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
        corresponds to ``num_frames_from_header``.
        """
        if self.num_frames_from_content is not None:
            return self.num_frames_from_content
        else:
            return self.num_frames_from_header

    @property
    def average_fps(self) -> Optional[float]:
        """Average fps of the stream. If a :term:`scan` was performed, this is
        computed from the number of frames and the duration of the stream.
        Otherwise we fall back to ``average_fps_from_header``. The header value
        is also returned when the scanned begin and end timestamps are equal,
        since the ratio is undefined for a zero-length span.
        """
        begin = self.begin_stream_seconds_from_content
        end = self.end_stream_seconds_from_content
        num_frames = self.num_frames
        if (
            end is None
            or begin is None
            or num_frames is None
            # Guard against ZeroDivisionError on a zero-length span.
            or end == begin
        ):
            return self.average_fps_from_header
        return num_frames / (end - begin)

    def __repr__(self):
        """Return the field listing built by the parent class, followed by one
        indented line for each computed property of this class.
        """
        s = super().__repr__()
        s += f"{SPACES}duration_seconds: {self.duration_seconds}\n"
        s += f"{SPACES}begin_stream_seconds: {self.begin_stream_seconds}\n"
        s += f"{SPACES}end_stream_seconds: {self.end_stream_seconds}\n"
        s += f"{SPACES}num_frames: {self.num_frames}\n"
        s += f"{SPACES}average_fps: {self.average_fps}\n"
        return s


@dataclass
class AudioStreamMetadata(StreamMetadata):
    """Metadata of a single audio stream."""

    sample_rate: Optional[int]
    """The original sample rate."""
    num_channels: Optional[int]
    """The number of channels (1 for mono, 2 for stereo, etc.)"""
    sample_format: Optional[str]
    """The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""

    def __repr__(self):
        """Return the multi-line field listing built by the parent class."""
        # Defined explicitly on purpose: @dataclass only generates a __repr__
        # when the class body does not define one, and a generated __repr__
        # would replace the inherited listing format.
        return super().__repr__()

@dataclass
class ContainerMetadata:
    """Metadata of a container together with the metadata of its streams."""

    # Container-level values; the names indicate they come from the header.
    duration_seconds_from_header: Optional[float]
    bit_rate_from_header: Optional[float]
    # Positions in ``streams`` of the best video / audio stream, or None when
    # unknown.
    best_video_stream_index: Optional[int]
    best_audio_stream_index: Optional[int]

    # One entry per stream; entries are indexed by the best_*_stream_index
    # values above.
    streams: List[StreamMetadata]

    @property
    def duration_seconds(self) -> Optional[float]:
        """Not implemented yet: always raises ``NotImplementedError``."""
        raise NotImplementedError("Decide on logic and implement this!")

    @property
    def bit_rate(self) -> Optional[float]:
        """Not implemented yet: always raises ``NotImplementedError``."""
        raise NotImplementedError("Decide on logic and implement this!")

    @property
    def best_video_stream(self) -> VideoStreamMetadata:
        """Metadata of the stream at ``best_video_stream_index``.

        Raises ``ValueError`` when that index is None.
        """
        index = self.best_video_stream_index
        if index is not None:
            stream = self.streams[index]
            assert isinstance(stream, VideoStreamMetadata)  # mypy <3
            return stream
        raise ValueError("The best video stream is unknown.")

    @property
    def best_audio_stream(self) -> AudioStreamMetadata:
        """Metadata of the stream at ``best_audio_stream_index``.

        Raises ``ValueError`` when that index is None.
        """
        index = self.best_audio_stream_index
        if index is not None:
            stream = self.streams[index]
            assert isinstance(stream, AudioStreamMetadata)  # mypy <3
            return stream
        raise ValueError("The best audio stream is unknown.")


# TODO-AUDIO: This is user-facing. Should this just be `get_metadata`, without
# the "container" name in it? Same below.
def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
    """Build a :class:`ContainerMetadata` from a decoder.

    The container-level and per-stream metadata are fetched as JSON strings,
    parsed, and mapped onto the metadata dataclasses. How accurate the values
    are, and whether some fields are available at all, depends on whether the
    decoder performed a full scan.
    """
    container_info = json.loads(_get_container_json_metadata(decoder))

    streams: List[StreamMetadata] = []
    for index in range(container_info["numStreams"]):
        info = json.loads(_get_stream_json_metadata(decoder, index))

        # Fields that every stream type has in common.
        shared_fields = {
            "duration_seconds_from_header": info.get("durationSeconds"),
            "bit_rate": info.get("bitRate"),
            "begin_stream_seconds_from_header": info.get("beginStreamFromHeader"),
            "codec": info.get("codec"),
            "stream_index": index,
        }

        media_type = info["mediaType"]
        entry: StreamMetadata
        if media_type == "video":
            entry = VideoStreamMetadata(
                begin_stream_seconds_from_content=info.get("minPtsSecondsFromScan"),
                end_stream_seconds_from_content=info.get("maxPtsSecondsFromScan"),
                width=info.get("width"),
                height=info.get("height"),
                num_frames_from_header=info.get("numFrames"),
                num_frames_from_content=info.get("numFramesFromScan"),
                average_fps_from_header=info.get("averageFps"),
                **shared_fields,
            )
        elif media_type == "audio":
            entry = AudioStreamMetadata(
                sample_rate=info.get("sampleRate"),
                num_channels=info.get("numChannels"),
                sample_format=info.get("sampleFormat"),
                **shared_fields,
            )
        else:
            # Neither video nor audio (e.g. subtitles). A placeholder entry is
            # still stored so that positions in the list keep matching the
            # stream indices.
            entry = StreamMetadata(**shared_fields)
        streams.append(entry)

    return ContainerMetadata(
        duration_seconds_from_header=container_info.get("durationSeconds"),
        bit_rate_from_header=container_info.get("bitRate"),
        best_video_stream_index=container_info.get("bestVideoStreamIndex"),
        best_audio_stream_index=container_info.get("bestAudioStreamIndex"),
        streams=streams,
    )


def get_container_metadata_from_header(
    filename: Union[str, pathlib.Path]
) -> ContainerMetadata:
    """Return the container metadata of the file at ``filename``.

    A decoder is created from the file with ``seek_mode="approximate"`` and
    then handed to ``get_container_metadata``.
    """
    # NOTE(review): presumably "approximate" mode skips the full scan, so the
    # *_from_content fields would be None -- confirm against create_from_file.
    path_str = str(filename)
    decoder = create_from_file(path_str, seek_mode="approximate")
    return get_container_metadata(decoder)
# Source code for torchcodec._core._metadata
#
# Docs
#
# Tutorials
#
# Resources