Shortcuts

Source code for torchcodec._core._metadata

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import dataclasses
import json
import pathlib
from dataclasses import dataclass
from typing import List, Optional, Union

import torch

from torchcodec._core.ops import (
    _get_container_json_metadata,
    _get_stream_json_metadata,
    create_from_file,
)


# Indentation prefix placed before every "name: value" line emitted by the
# metadata ``__repr__`` methods in this module.
SPACES = "  "


@dataclass
class StreamMetadata:
    """Metadata shared by every kind of stream (video, audio, or other)."""

    duration_seconds_from_header: Optional[float]
    """Duration of the stream, in seconds, obtained from the header (float or
    None). This could be inaccurate."""
    begin_stream_seconds_from_header: Optional[float]
    """Beginning of the stream, in seconds, obtained from the header (float or
    None). Usually, this is equal to 0."""
    bit_rate: Optional[float]
    """Bit rate of the stream, in bits per second (float or None)."""
    codec: Optional[str]
    """Codec (str or None)."""
    stream_index: int
    """Index of the stream that this metadata refers to (int)."""

    def __repr__(self):
        """Return a multi-line, human-readable description of the metadata.

        The first line is the class name followed by a colon; each dataclass
        field then gets its own ``SPACES``-indented ``name: value`` line.
        ``dataclasses.fields`` is evaluated on the instance, so fields added
        by subclasses are listed as well. The result ends with a newline.
        """
        header = f"{self.__class__.__name__}:\n"
        field_lines = [
            f"{SPACES}{field.name}: {getattr(self, field.name)}\n"
            for field in dataclasses.fields(self)
        ]
        return header + "".join(field_lines)


@dataclass
class VideoStreamMetadata(StreamMetadata):
    """Metadata of a single video stream."""

    begin_stream_seconds_from_content: Optional[float]
    """Beginning of the stream, in seconds (float or None).
    Conceptually, this corresponds to the first frame's :term:`pts`. It is only
    computed when a :term:`scan` is done as min(frame.pts) across all frames in
    the stream. Usually, this is equal to 0."""
    end_stream_seconds_from_content: Optional[float]
    """End of the stream, in seconds (float or None).
    Conceptually, this corresponds to last_frame.pts + last_frame.duration. It
    is only computed when a :term:`scan` is done as max(frame.pts +
    frame.duration) across all frames in the stream. Note that no frame is
    played at this time value, so calling
    :meth:`~torchcodec.decoders.VideoDecoder.get_frame_played_at` with
    this value would result in an error. Retrieving the last frame is best done
    by simply indexing the :class:`~torchcodec.decoders.VideoDecoder`
    object with ``[-1]``.
    """
    width: Optional[int]
    """Width of the frames (int or None)."""
    height: Optional[int]
    """Height of the frames (int or None)."""
    num_frames_from_header: Optional[int]
    """Number of frames, from the stream's metadata. This is potentially
    inaccurate. We recommend using the ``num_frames`` attribute instead.
    (int or None)."""
    num_frames_from_content: Optional[int]
    """Number of frames computed by TorchCodec by scanning the stream's
    content (the scan doesn't involve decoding). This is more accurate
    than ``num_frames_from_header``. We recommend using the
    ``num_frames`` attribute instead. (int or None)."""
    average_fps_from_header: Optional[float]
    """Average fps of the stream, obtained from the header (float or None).
    We recommend using the ``average_fps`` attribute instead."""

    @property
    def duration_seconds(self) -> Optional[float]:
        """Duration of the stream in seconds. We try to calculate the duration
        from the actual frames if a :term:`scan` was performed. Otherwise we
        fall back to ``duration_seconds_from_header``.
        """
        if (
            self.end_stream_seconds_from_content is None
            or self.begin_stream_seconds_from_content is None
        ):
            return self.duration_seconds_from_header
        return (
            self.end_stream_seconds_from_content
            - self.begin_stream_seconds_from_content
        )

    @property
    def begin_stream_seconds(self) -> float:
        """Beginning of the stream, in seconds (float). Conceptually, this
        corresponds to the first frame's :term:`pts`. If
        ``begin_stream_seconds_from_content`` is not None, then it is returned.
        Otherwise, this value is 0.
        """
        if self.begin_stream_seconds_from_content is None:
            return 0
        else:
            return self.begin_stream_seconds_from_content

    @property
    def end_stream_seconds(self) -> Optional[float]:
        """End of the stream, in seconds (float or None).
        Conceptually, this corresponds to last_frame.pts + last_frame.duration.
        If ``end_stream_seconds_from_content`` is not None, then that value is
        returned. Otherwise, returns ``duration_seconds``.
        """
        if self.end_stream_seconds_from_content is None:
            return self.duration_seconds
        else:
            return self.end_stream_seconds_from_content

    @property
    def num_frames(self) -> Optional[int]:
        """Number of frames in the stream. This corresponds to
        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
        corresponds to ``num_frames_from_header``.
        """
        if self.num_frames_from_content is not None:
            return self.num_frames_from_content
        else:
            return self.num_frames_from_header

    @property
    def average_fps(self) -> Optional[float]:
        """Average fps of the stream. If a :term:`scan` was performed, this is
        computed from the number of frames and the duration of the stream.
        Otherwise we fall back to ``average_fps_from_header``. The header
        value is also used when the scanned begin and end times are equal,
        because no frame rate can be derived from a zero-length span.
        """
        begin = self.begin_stream_seconds_from_content
        end = self.end_stream_seconds_from_content
        if (
            end is None
            or begin is None
            or self.num_frames is None
            # A zero-length span would otherwise raise ZeroDivisionError
            # below, and __repr__ reads this property, so printing the
            # metadata would crash as well.
            or end == begin
        ):
            return self.average_fps_from_header
        return self.num_frames / (end - begin)

    def __repr__(self):
        """Return the field listing from ``StreamMetadata.__repr__`` followed
        by one indented ``name: value`` line per derived property.
        """
        # Defined explicitly: the listing should also show the derived
        # properties, which are not dataclass fields.
        derived_lines = [
            f"{SPACES}duration_seconds: {self.duration_seconds}\n",
            f"{SPACES}begin_stream_seconds: {self.begin_stream_seconds}\n",
            f"{SPACES}end_stream_seconds: {self.end_stream_seconds}\n",
            f"{SPACES}num_frames: {self.num_frames}\n",
            f"{SPACES}average_fps: {self.average_fps}\n",
        ]
        return super().__repr__() + "".join(derived_lines)
@dataclass
class AudioStreamMetadata(StreamMetadata):
    """Metadata of a single audio stream."""

    sample_rate: Optional[int]
    """The original sample rate."""
    num_channels: Optional[int]
    """The number of channels (1 for mono, 2 for stereo, etc.)"""
    sample_format: Optional[str]
    """The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""

    def __repr__(self):
        """Return the multi-line description built by ``StreamMetadata``."""
        # Defined explicitly on purpose: ``@dataclass`` generates its own
        # ``__repr__`` for any class that does not define one in its body,
        # which would replace the inherited multi-line format.
        return super().__repr__()
@dataclass
class ContainerMetadata:
    """Metadata of a container and of every stream it holds."""

    # Container-level values, as reported under the "durationSeconds" and
    # "bitRate" keys of the container JSON (None when absent).
    duration_seconds_from_header: Optional[float]
    bit_rate_from_header: Optional[float]
    # Indices into ``streams`` of the best video / audio stream, or None when
    # the container JSON does not report one.
    best_video_stream_index: Optional[int]
    best_audio_stream_index: Optional[int]
    # One entry per stream; the position in the list is the stream index.
    streams: List[StreamMetadata]

    @property
    def duration_seconds(self) -> Optional[float]:
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Decide on logic and implement this!")

    @property
    def bit_rate(self) -> Optional[float]:
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Decide on logic and implement this!")

    @property
    def best_video_stream(self) -> VideoStreamMetadata:
        """Metadata of the stream at ``best_video_stream_index``.

        Raises:
            ValueError: if ``best_video_stream_index`` is None.
        """
        index = self.best_video_stream_index
        if index is None:
            raise ValueError("The best video stream is unknown.")
        metadata = self.streams[index]
        assert isinstance(metadata, VideoStreamMetadata)  # mypy <3
        return metadata

    @property
    def best_audio_stream(self) -> AudioStreamMetadata:
        """Metadata of the stream at ``best_audio_stream_index``.

        Raises:
            ValueError: if ``best_audio_stream_index`` is None.
        """
        index = self.best_audio_stream_index
        if index is None:
            raise ValueError("The best audio stream is unknown.")
        metadata = self.streams[index]
        assert isinstance(metadata, AudioStreamMetadata)  # mypy <3
        return metadata


# TODO-AUDIO: This is user-facing. Should this just be `get_metadata`, without
# the "container" name in it? Same below.
def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
    """Build a :class:`ContainerMetadata` from a decoder.

    How accurate the metadata is, and which of the returned fields are
    available, depends on whether the decoder performed a full scan.

    Args:
        decoder: The decoder handle to query.

    Returns:
        The container metadata, holding one stream entry per stream reported
        by the container.
    """
    container_info = json.loads(_get_container_json_metadata(decoder))

    per_stream: List[StreamMetadata] = []
    for index in range(container_info["numStreams"]):
        info = json.loads(_get_stream_json_metadata(decoder, index))

        # Fields that every kind of stream entry carries.
        shared_fields = {
            "duration_seconds_from_header": info.get("durationSeconds"),
            "bit_rate": info.get("bitRate"),
            "begin_stream_seconds_from_header": info.get("beginStreamFromHeader"),
            "codec": info.get("codec"),
            "stream_index": index,
        }

        media_type = info["mediaType"]
        entry: StreamMetadata
        if media_type == "video":
            entry = VideoStreamMetadata(
                begin_stream_seconds_from_content=info.get("minPtsSecondsFromScan"),
                end_stream_seconds_from_content=info.get("maxPtsSecondsFromScan"),
                width=info.get("width"),
                height=info.get("height"),
                num_frames_from_header=info.get("numFrames"),
                num_frames_from_content=info.get("numFramesFromScan"),
                average_fps_from_header=info.get("averageFps"),
                **shared_fields,
            )
        elif media_type == "audio":
            entry = AudioStreamMetadata(
                sample_rate=info.get("sampleRate"),
                num_channels=info.get("numChannels"),
                sample_format=info.get("sampleFormat"),
                **shared_fields,
            )
        else:
            # Neither video nor audio (could be e.g. subtitles). A placeholder
            # entry is still added so that the length of the stream list
            # matches the number of streams.
            entry = StreamMetadata(**shared_fields)
        per_stream.append(entry)

    return ContainerMetadata(
        duration_seconds_from_header=container_info.get("durationSeconds"),
        bit_rate_from_header=container_info.get("bitRate"),
        best_video_stream_index=container_info.get("bestVideoStreamIndex"),
        best_audio_stream_index=container_info.get("bestAudioStreamIndex"),
        streams=per_stream,
    )


def get_container_metadata_from_header(
    filename: Union[str, pathlib.Path]
) -> ContainerMetadata:
    """Return the container metadata of the file at ``filename``.

    The decoder is created with ``seek_mode="approximate"`` and then handed
    to :func:`get_container_metadata`.
    """
    decoder = create_from_file(str(filename), seek_mode="approximate")
    return get_container_metadata(decoder)

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources