Source code for torch_tensorrt.dynamo._settings

from dataclasses import dataclass, field
from typing import Any, Collection, Optional, Set, Tuple, Union

import tensorrt as trt
import torch
from torch.fx.node import Target
from torch_tensorrt._Device import Device
from torch_tensorrt._enums import EngineCapability, dtype
from torch_tensorrt.dynamo._defaults import (
    ASSUME_DYNAMIC_SHAPE_SUPPORT,
    AUTOCAST_CALIBRATION_DATALOADER,
    AUTOCAST_EXCLUDED_NODES,
    AUTOCAST_EXCLUDED_OPS,
    AUTOCAST_LOW_PRECISION_TYPE,
    AUTOCAST_MAX_DEPTH_OF_REDUCTION,
    AUTOCAST_MAX_OUTPUT_THRESHOLD,
    CACHE_BUILT_ENGINES,
    CPU_MEMORY_BUDGET,
    DISABLE_TF32,
    DLA_GLOBAL_DRAM_SIZE,
    DLA_LOCAL_DRAM_SIZE,
    DLA_SRAM_SIZE,
    DRYRUN,
    ENABLE_AUTOCAST,
    ENABLE_CROSS_COMPILE_FOR_WINDOWS,
    ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
    ENABLE_RESOURCE_PARTITIONING,
    ENABLE_WEIGHT_STREAMING,
    ENABLED_PRECISIONS,
    ENGINE_CAPABILITY,
    HARDWARE_COMPATIBLE,
    IMMUTABLE_WEIGHTS,
    L2_LIMIT_FOR_TILING,
    LAZY_ENGINE_INIT,
    MAX_AUX_STREAMS,
    MIN_BLOCK_SIZE,
    NUM_AVG_TIMING_ITERS,
    OFFLOAD_MODULE_TO_CPU,
    OPTIMIZATION_LEVEL,
    PASS_THROUGH_BUILD_FAILURES,
    REFIT_IDENTICAL_ENGINE_WEIGHTS,
    REQUIRE_FULL_COMPILATION,
    REUSE_CACHED_ENGINES,
    SPARSE_WEIGHTS,
    STRIP_ENGINE_WEIGHTS,
    TILING_OPTIMIZATION_LEVEL,
    TIMING_CACHE_PATH,
    TRUNCATE_DOUBLE,
    USE_DISTRIBUTED_MODE_TRACE,
    USE_EXPLICIT_TYPING,
    USE_FAST_PARTITIONER,
    USE_FP32_ACC,
    USE_PYTHON_RUNTIME,
    VERSION_COMPATIBLE,
    WORKSPACE_SIZE,
    default_device,
)


@dataclass
class CompilationSettings:
    """Compilation settings for Torch-TensorRT Dynamo Paths

    Args:
        enabled_precisions (Set[dtype]): Available kernel dtype precisions
        workspace_size (int): Workspace TRT is allowed to use for the module (0 is default)
        min_block_size (int): Minimum number of operators per TRT-Engine Block
        torch_executed_ops (Collection[Target]): Collection of operations to run in Torch, regardless of converter coverage
        pass_through_build_failures (bool): Whether to fail on TRT engine build errors (True) or not (False)
        max_aux_streams (Optional[int]): Maximum number of allowed auxiliary TRT streams for each engine
        version_compatible (bool): Provide version forward-compatibility for engine plan files
        optimization_level (Optional[int]): Builder optimization level 0-5; higher levels imply longer build time,
            searching for more optimization options. TRT defaults to 3
        use_python_runtime (Optional[bool]): Whether to strictly use the Python runtime or the C++ runtime. To auto-select
            a runtime based on C++ dependency presence (preferentially choosing the C++ runtime if available), leave the
            argument as None
        truncate_double (bool): Whether to truncate float64 TRT engine inputs or weights to float32
        use_fast_partitioner (bool): Whether to use the fast or global graph partitioning system
        enable_experimental_decompositions (bool): Whether to enable all core aten decompositions
            or only a selected subset of them
        device (Device): GPU to compile the model on
        require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT.
            Only applicable for `ir="dynamo"`; has no effect for the `torch.compile` path
        assume_dynamic_shape_support (bool): Setting this to True enables the converters to work for both
            dynamic and static shapes. Default: False
        disable_tf32 (bool): Whether to disable TF32 computation for TRT layers
        sparse_weights (bool): Whether to allow the builder to use sparse weights
        engine_capability (trt.EngineCapability): Restrict kernel selection to safe GPU kernels or safe DLA kernels
        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
        dla_sram_size (int): Fast software-managed RAM used by DLA to communicate within a layer
        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
        dryrun (Union[bool, str]): Toggle "Dryrun" mode, which runs everything through partitioning, short of conversion to
            TRT Engines. Prints detailed logs of the graph structure and nature of partitioning. Optionally saves the
            output to a file if a string path is specified
        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU
            on which the engine was built (currently works for NVIDIA Ampere and newer)
        timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
        cache_built_engines (bool): Whether to save the compiled TRT engines to storage
        reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
        use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation, which respects the precisions
            set in the PyTorch model. This is useful when users have mixed-precision graphs.
        use_fp32_acc (bool): This option inserts casts to FP32 around matmul layers so that TensorRT accumulates the matmul
            in FP32. Use this only when FP16 precision is configured in enabled_precisions.
        refit_identical_engine_weights (bool): Whether to refit the engine with identical weights
        strip_engine_weights (bool): Whether to strip the engine weights
        immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable.
            If this argument is set to True, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored
        enable_weight_streaming (bool): Enable weight streaming.
        enable_cross_compile_for_windows (bool): By default this is False, which means TensorRT engines can only be
            executed on the same platform where they were built. Setting it to True enables cross-platform compatibility,
            allowing the engine to be built on Linux and run on Windows
        tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to
            spend more time searching for a better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
        l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization
            (default is -1, which means no limit).
        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or other
            distributed tensors are present in the model
        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support
            torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match user-specified node names that should
            remain in FP32. Default is [].
        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
        autocast_max_output_threshold (float): Maximum absolute value for node outputs; nodes with outputs greater than
            this value will remain in FP32. Default is 512.
        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with
            higher reduction depths will remain in FP32. This helps prevent excessive accuracy loss in operations
            particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in
            low-precision formats. If not provided, infinity will be used. Default is None.
        autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast
            calibration. Default is None.
    """

    enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
    workspace_size: int = WORKSPACE_SIZE
    min_block_size: int = MIN_BLOCK_SIZE
    torch_executed_ops: Collection[Target] = field(default_factory=set)
    pass_through_build_failures: bool = PASS_THROUGH_BUILD_FAILURES
    max_aux_streams: Optional[int] = MAX_AUX_STREAMS
    version_compatible: bool = VERSION_COMPATIBLE
    optimization_level: Optional[int] = OPTIMIZATION_LEVEL
    use_python_runtime: Optional[bool] = USE_PYTHON_RUNTIME
    truncate_double: bool = TRUNCATE_DOUBLE
    use_fast_partitioner: bool = USE_FAST_PARTITIONER
    enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS
    device: Device = field(default_factory=default_device)
    require_full_compilation: bool = REQUIRE_FULL_COMPILATION
    disable_tf32: bool = DISABLE_TF32
    assume_dynamic_shape_support: bool = ASSUME_DYNAMIC_SHAPE_SUPPORT
    sparse_weights: bool = SPARSE_WEIGHTS
    engine_capability: EngineCapability = field(
        default_factory=lambda: ENGINE_CAPABILITY
    )
    num_avg_timing_iters: int = NUM_AVG_TIMING_ITERS
    dla_sram_size: int = DLA_SRAM_SIZE
    dla_local_dram_size: int = DLA_LOCAL_DRAM_SIZE
    dla_global_dram_size: int = DLA_GLOBAL_DRAM_SIZE
    dryrun: Union[bool, str] = DRYRUN
    hardware_compatible: bool = HARDWARE_COMPATIBLE
    timing_cache_path: str = TIMING_CACHE_PATH
    lazy_engine_init: bool = LAZY_ENGINE_INIT
    cache_built_engines: bool = CACHE_BUILT_ENGINES
    reuse_cached_engines: bool = REUSE_CACHED_ENGINES
    use_explicit_typing: bool = USE_EXPLICIT_TYPING
    use_fp32_acc: bool = USE_FP32_ACC
    refit_identical_engine_weights: bool = REFIT_IDENTICAL_ENGINE_WEIGHTS
    strip_engine_weights: bool = STRIP_ENGINE_WEIGHTS
    immutable_weights: bool = IMMUTABLE_WEIGHTS
    enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING
    enable_cross_compile_for_windows: bool = ENABLE_CROSS_COMPILE_FOR_WINDOWS
    tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
    l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
    use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
    enable_autocast: bool = ENABLE_AUTOCAST
    autocast_low_precision_type: Optional[dtype] = AUTOCAST_LOW_PRECISION_TYPE
    autocast_excluded_nodes: Collection[str] = field(
        default_factory=lambda: AUTOCAST_EXCLUDED_NODES
    )
    autocast_excluded_ops: Collection[Target] = field(
        default_factory=lambda: AUTOCAST_EXCLUDED_OPS
    )
    autocast_max_output_threshold: float = AUTOCAST_MAX_OUTPUT_THRESHOLD
    autocast_max_depth_of_reduction: Optional[int] = AUTOCAST_MAX_DEPTH_OF_REDUCTION
    autocast_calibration_dataloader: Optional[torch.utils.data.DataLoader] = (
        AUTOCAST_CALIBRATION_DATALOADER
    )
    enable_resource_partitioning: bool = ENABLE_RESOURCE_PARTITIONING
    cpu_memory_budget: int = CPU_MEMORY_BUDGET

    def __getstate__(self) -> dict[str, Any]:
        from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
            ConverterRegistry,
        )

        state = self.__dict__.copy()
        state["torch_executed_ops"] = {
            op if isinstance(op, str) else ConverterRegistry.qualified_name_or_str(op)
            for op in state["torch_executed_ops"]
        }
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
# If any of the following settings is changed, the engine should be rebuilt.
_SETTINGS_TO_BE_ENGINE_INVARIANT = {
    "enabled_precisions",
    "max_aux_streams",
    "version_compatible",
    "optimization_level",
    "disable_tf32",
    "sparse_weights",
    "engine_capability",
    "hardware_compatible",
    "refit_identical_engine_weights",
    "immutable_weights",
    "enable_weight_streaming",
    "tiling_optimization_level",
    "l2_limit_for_tiling",
    "enable_autocast",
    "autocast_low_precision_type",
    "autocast_excluded_nodes",
    "autocast_excluded_ops",
    "autocast_max_output_threshold",
    "autocast_max_depth_of_reduction",
    "autocast_calibration_dataloader",
}

if not hasattr(trt.SerializationFlag, "INCLUDE_REFIT"):
    # for TensorRT < 10.14
    _SETTINGS_TO_BE_ENGINE_INVARIANT.add("strip_engine_weights")


def settings_are_compatible(
    set_a: CompilationSettings, set_b: CompilationSettings
) -> Tuple[bool, Set[str]]:
    incompatible_settings: Set[str] = set()

    for f in _SETTINGS_TO_BE_ENGINE_INVARIANT:
        if getattr(set_a, f) != getattr(set_b, f):
            incompatible_settings.add(f)

    if len(incompatible_settings) == 0:
        return True, set()
    else:
        return False, incompatible_settings
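
A hypothetical sketch of how a caching layer might consult `settings_are_compatible` before reusing a previously built engine; the `cached`/`requested` scenario and the chosen values are assumptions, not code from this module.

from torch_tensorrt.dynamo._settings import (
    CompilationSettings,
    settings_are_compatible,
)

cached = CompilationSettings(optimization_level=3)
requested = CompilationSettings(optimization_level=5, min_block_size=7)

compatible, mismatched = settings_are_compatible(cached, requested)
# optimization_level is engine-invariant, so the cached engine must be rebuilt;
# min_block_size is not in _SETTINGS_TO_BE_ENGINE_INVARIANT and is ignored here.
print(compatible)   # False
print(mismatched)   # {'optimization_level'}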
