Source code for torchcodec.decoders._core._metadata
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import dataclasses
import json
import pathlib
from dataclasses import dataclass
from typing import List, Optional, Union
import torch
from torchcodec.decoders._core.video_decoder_ops import (
_get_container_json_metadata,
_get_stream_json_metadata,
create_from_file,
)
[docs]@dataclass
class VideoStreamMetadata:
"""Metadata of a single video stream."""
duration_seconds_from_header: Optional[float]
"""Duration of the stream, in seconds, obtained from the header (float or
None). This could be inaccurate."""
bit_rate: Optional[float]
"""Bit rate of the stream, in seconds (float or None)."""
num_frames_from_header: Optional[int]
"""Number of frames, from the stream's metadata. This is potentially
inaccurate. We recommend using the ``num_frames`` attribute instead.
(int or None)."""
num_frames_from_content: Optional[int]
"""Number of frames computed by TorchCodec by scanning the stream's
content (the scan doesn't involve decoding). This is more accurate
than ``num_frames_from_header``. We recommend using the
``num_frames`` attribute instead. (int or None)."""
begin_stream_seconds_from_content: Optional[float]
"""Beginning of the stream, in seconds (float or None).
Conceptually, this corresponds to the first frame's :term:`pts`. It is
computed as min(frame.pts) across all frames in the stream. Usually, this is
equal to 0."""
end_stream_seconds_from_content: Optional[float]
"""End of the stream, in seconds (float or None).
Conceptually, this corresponds to last_frame.pts + last_frame.duration. It
is computed as max(frame.pts + frame.duration) across all frames in the
stream. Note that no frame is played at this time value, so calling
:meth:`~torchcodec.decoders.VideoDecoder.get_frame_played_at` with
this value would result in an error. Retrieving the last frame is best done
by simply indexing the :class:`~torchcodec.decoders.VideoDecoder`
object with ``[-1]``.
"""
codec: Optional[str]
"""Codec (str or None)."""
width: Optional[int]
"""Width of the frames (int or None)."""
height: Optional[int]
"""Height of the frames (int or None)."""
average_fps_from_header: Optional[float]
"""Averate fps of the stream, obtained from the header (float or None).
We recommend using the ``average_fps`` attribute instead."""
stream_index: int
"""Index of the stream within the video (int)."""
@property
def num_frames(self) -> Optional[int]:
"""Number of frames in the stream. This corresponds to
``num_frames_from_content`` if a :term:`scan` was made, otherwise it
corresponds to ``num_frames_from_header``.
"""
if self.num_frames_from_content is not None:
return self.num_frames_from_content
else:
return self.num_frames_from_header
@property
def duration_seconds(self) -> Optional[float]:
"""Duration of the stream in seconds. We try to calculate the duration
from the actual frames if a :term:`scan` was performed. Otherwise we
fall back to ``duration_seconds_from_header``.
"""
if (
self.end_stream_seconds_from_content is None
or self.begin_stream_seconds_from_content is None
):
return self.duration_seconds_from_header
return (
self.end_stream_seconds_from_content
- self.begin_stream_seconds_from_content
)
@property
def average_fps(self) -> Optional[float]:
"""Average fps of the stream. If a :term:`scan` was perfomed, this is
computed from the number of frames and the duration of the stream.
Otherwise we fall back to ``average_fps_from_header``.
"""
if (
self.end_stream_seconds_from_content is None
or self.begin_stream_seconds_from_content is None
or self.num_frames is None
):
return self.average_fps_from_header
return self.num_frames / (
self.end_stream_seconds_from_content
- self.begin_stream_seconds_from_content
)
@property
def begin_stream_seconds(self) -> float:
"""Beginning of the stream, in seconds (float). Conceptually, this
corresponds to the first frame's :term:`pts`. If
``begin_stream_seconds_from_content`` is not None, then it is returned.
Otherwise, this value is 0.
"""
if self.begin_stream_seconds_from_content is None:
return 0
else:
return self.begin_stream_seconds_from_content
@property
def end_stream_seconds(self) -> Optional[float]:
"""End of the stream, in seconds (float or None).
Conceptually, this corresponds to last_frame.pts + last_frame.duration.
If ``end_stream_seconds_from_content`` is not None, then that value is
returned. Otherwise, returns ``duration_seconds``.
"""
if self.end_stream_seconds_from_content is None:
return self.duration_seconds
else:
return self.end_stream_seconds_from_content
def __repr__(self):
# Overridden because properites are not printed by default.
s = self.__class__.__name__ + ":\n"
spaces = " "
s += f"{spaces}num_frames: {self.num_frames}\n"
s += f"{spaces}duration_seconds: {self.duration_seconds}\n"
s += f"{spaces}average_fps: {self.average_fps}\n"
for field in dataclasses.fields(self):
s += f"{spaces}{field.name}: {getattr(self, field.name)}\n"
return s
@dataclass
class VideoMetadata:
duration_seconds_from_header: Optional[float]
bit_rate_from_header: Optional[float]
best_video_stream_index: Optional[int]
best_audio_stream_index: Optional[int]
streams: List[VideoStreamMetadata]
@property
def duration_seconds(self) -> Optional[float]:
raise NotImplementedError("Decide on logic and implement this!")
@property
def bit_rate(self) -> Optional[float]:
raise NotImplementedError("Decide on logic and implement this!")
@property
def best_video_stream(self) -> VideoStreamMetadata:
if self.best_video_stream_index is None:
raise ValueError("The best video stream is unknown.")
return self.streams[self.best_video_stream_index]
def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata:
"""Return video metadata from a video decoder.
The accuracy of the metadata and the availability of some returned fields
depends on whether a full scan was performed by the decoder.
"""
container_dict = json.loads(_get_container_json_metadata(decoder))
streams_metadata = []
for stream_index in range(container_dict["numStreams"]):
stream_dict = json.loads(_get_stream_json_metadata(decoder, stream_index))
streams_metadata.append(
VideoStreamMetadata(
duration_seconds_from_header=stream_dict.get("durationSeconds"),
bit_rate=stream_dict.get("bitRate"),
num_frames_from_header=stream_dict.get("numFrames"),
num_frames_from_content=stream_dict.get("numFramesFromScan"),
begin_stream_seconds_from_content=stream_dict.get(
"minPtsSecondsFromScan"
),
end_stream_seconds_from_content=stream_dict.get(
"maxPtsSecondsFromScan"
),
codec=stream_dict.get("codec"),
width=stream_dict.get("width"),
height=stream_dict.get("height"),
average_fps_from_header=stream_dict.get("averageFps"),
stream_index=stream_index,
)
)
return VideoMetadata(
duration_seconds_from_header=container_dict.get("durationSeconds"),
bit_rate_from_header=container_dict.get("bitRate"),
best_video_stream_index=container_dict.get("bestVideoStreamIndex"),
best_audio_stream_index=container_dict.get("bestAudioStreamIndex"),
streams=streams_metadata,
)
def get_video_metadata_from_header(filename: Union[str, pathlib.Path]) -> VideoMetadata:
return get_video_metadata(create_from_file(str(filename), seek_mode="approximate"))