Source code for torchao.float8.float8_linear_utils

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
import logging
from functools import partial
from typing import Callable, List, Optional, Union

import torch.nn as nn

from torchao.float8.config import Float8LinearConfig, Float8LinearRecipeName
from torchao.float8.float8_linear import Float8Linear

log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())


def swap_linear_layers(
    module: nn.Module,
    from_float_func: Callable[[nn.Linear], nn.Linear],
    *,
    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
) -> nn.Module:
    """
    Generic function to swap linear layers in a module with a new type of linear layer.

    Note:
        If applied to a root-level nn.Linear, the module is not modified in place;
        the new linear layer is returned instead.

    Args:
        module: Module to modify.
        from_float_func: Function that accepts a linear layer and returns a new type of linear layer.
        module_filter_fn: If specified, only the `torch.nn.Linear` subclasses that
            pass the filter function will be swapped. The inputs to the
            filter function are the module instance and the FQN.

    Returns:
        nn.Module: The modified module with swapped linear layers.
    """
    if isinstance(module, nn.Linear) and (
        module_filter_fn is None or module_filter_fn(module, "")
    ):
        if len(list(module.children())) > 0:
            raise AssertionError(
                f"Does not support a root nn.Linear with children: {module}"
            )
        return from_float_func(
            module,
        )

    root_module = module

    def post_order_traversal(
        module: nn.Module,
        cur_fqn: Optional[str] = None,
        parent_module: Optional[nn.Module] = None,
    ):
        if cur_fqn is None:
            cur_fqn = ""

        for child_module_name, child_module in module.named_children():
            if cur_fqn == "":
                new_fqn = child_module_name
            else:
                new_fqn = f"{cur_fqn}.{child_module_name}"

            post_order_traversal(child_module, new_fqn, module)

        if isinstance(module, nn.Linear) and (
            module_filter_fn is None or module_filter_fn(module, cur_fqn)
        ):
            assert parent_module is not None, (
                f"Linear root module should return early: {module}"
            )
            new_linear_module = from_float_func(module)
            cur_module_name = cur_fqn.split(".")[-1]
            setattr(parent_module, cur_module_name, new_linear_module)

    post_order_traversal(root_module)
    return root_module

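To make the generic swap concrete, here is a minimal, hypothetical usage sketch of `swap_linear_layers` with a hand-written conversion function and an FQN-based filter. `TinyModel`, `only_proj_in`, and `to_new_linear` are illustrative names for this example, not part of the library.

import torch.nn as nn

from torchao.float8.float8_linear_utils import swap_linear_layers


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj_in = nn.Linear(1024, 4096)
        self.proj_out = nn.Linear(4096, 1024)


# Only swap linears whose fully-qualified name contains "proj_in".
def only_proj_in(mod: nn.Module, fqn: str) -> bool:
    return "proj_in" in fqn


# Stand-in conversion: re-create the layer with the same shape. A real
# conversion would return a different linear subclass, e.g. via
# Float8Linear.from_float.
def to_new_linear(mod: nn.Linear) -> nn.Linear:
    return nn.Linear(mod.in_features, mod.out_features, bias=mod.bias is not None)


model = swap_linear_layers(TinyModel(), to_new_linear, module_filter_fn=only_proj_in)
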

def convert_to_float8_training(
    module: nn.Module,
    *,
    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
    config: Optional[Float8LinearConfig] = None,
) -> nn.Module:
    """
    Swaps `torch.nn.Linear` in `module` with `Float8Linear`.

    Args:
        module: Module to modify.
        module_filter_fn: If specified, only the `torch.nn.Linear` subclasses that
            pass the filter function will be swapped. The inputs to the
            filter function are the module instance and the FQN.
        config (Float8LinearConfig): configuration for conversion to float8

    Returns:
        nn.Module: The modified module with swapped linear layers.
    """
    if config is None:
        config = Float8LinearConfig()

    from_float = lambda m: Float8Linear.from_float(
        m,
        config=config,
    )
    return swap_linear_layers(
        module,
        from_float,
        module_filter_fn=module_filter_fn,
    )
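
As a quick illustration of the public entry point above, the following is a minimal sketch. The model, the bfloat16 cast, and the `keep_all_but_lm_head` filter are assumptions made for the example, and actual float8 execution requires supported hardware (e.g. H100-class GPUs).

import torch
import torch.nn as nn

from torchao.float8.float8_linear_utils import convert_to_float8_training

# Illustrative model; move it to a supported GPU before training.
model = nn.Sequential(
    nn.Linear(4096, 4096),
    nn.ReLU(),
    nn.Linear(4096, 4096),
).to(torch.bfloat16)


# Swap every nn.Linear except those whose FQN contains "lm_head"
# (none here; shown for illustration).
def keep_all_but_lm_head(mod: nn.Module, fqn: str) -> bool:
    return "lm_head" not in fqn


convert_to_float8_training(model, module_filter_fn=keep_all_but_lm_head)
# The nn.Linear children are now Float8Linear; train as usual,
# typically under torch.compile for best performance.
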
def _auto_filter_for_recipe(
    recipe: Union[str, Float8LinearRecipeName], filter_fqns: List[str]
) -> Callable[[nn.Module, str], bool]:
    """Returns a function which automatically filters out nn.Linear modules that meet
    at least one of the following criteria:

    1. Dims not divisible by 16 (hardware requirement for float8).
    2. Dim sizes below certain thresholds, which may result in worse performance.

    NOTE: the thresholds are simple heuristics based on performance testing, and may
    not be optimal for your model. For the best performance, we recommend defining
    your own module_filter_fn customized for your module, using the performance
    tables for the given float8 recipe here:
    https://github.com/pytorch/ao/tree/main/torchao/float8#performance. These
    benchmarks referenced for auto filtering layers were run on H100 GPUs, and may
    not be representative of other hardware.

    This is an experimental API; the design may change in the future.
    """
    if isinstance(recipe, str):
        recipe = Float8LinearRecipeName(recipe)

    if recipe == Float8LinearRecipeName.TENSORWISE:
        return partial(_auto_filter_for_tensorwise, filter_fqns=filter_fqns)
    elif recipe == Float8LinearRecipeName.ROWWISE:
        return partial(_auto_filter_for_rowwise, filter_fqns=filter_fqns)
    elif recipe == Float8LinearRecipeName.ROWWISE_WITH_GW_HP:
        raise NotImplementedError(f"Unsupported recipe: {recipe}")
    else:
        raise ValueError(f"Invalid recipe: {recipe}")


def _auto_filter_for_rowwise(mod: nn.Module, fqn: str, filter_fqns: List[str]) -> bool:
    if not isinstance(mod, nn.Linear):
        return False

    # If the fqn matches any filtered fqn, then we should not convert this module.
    is_filtered_fqn = any(filter_fqn in fqn for filter_fqn in filter_fqns)
    if is_filtered_fqn:
        return False

    # All dims must be divisible by 16 due to float8 hardware requirements.
    N, K = mod.weight.shape
    dims_multiples_of_16 = K % 16 == 0 and N % 16 == 0
    if not dims_multiples_of_16:
        return False

    # Dims below these thresholds may result in worse performance
    # (see https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling)
    # Note that these benchmarks referenced for auto filtering layers were run on
    # H100 GPUs, and may not be representative of other hardware.
    if N <= 2048:
        return False
    elif K <= 1024:
        return False
    elif N <= 4096 and K <= 2048:
        return False
    return True


def _auto_filter_for_tensorwise(
    mod: nn.Module, fqn: str, filter_fqns: List[str]
) -> bool:
    if not isinstance(mod, nn.Linear):
        return False

    # If the fqn matches any filtered fqn, then we should not convert this module.
    is_filtered_fqn = any(filter_fqn in fqn for filter_fqn in filter_fqns)
    if is_filtered_fqn:
        return False

    # All dims must be divisible by 16 due to float8 hardware requirements.
    N, K = mod.weight.shape
    dims_multiples_of_16 = K % 16 == 0 and N % 16 == 0
    if not dims_multiples_of_16:
        return False

    # Dims below these thresholds may result in worse performance
    # (see https://github.com/pytorch/ao/tree/main/torchao/float8#tensorwise-scaling)
    # Note that these benchmarks referenced for auto filtering layers were run on
    # H100 GPUs, and may not be representative of other hardware.
    if K <= 4096 and N <= 1024:
        return False
    return True
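
For completeness, here is a hypothetical sketch of combining the auto filter with `convert_to_float8_training`. The model shapes and `filter_fqns` are illustrative, these `_auto_filter_*` helpers are private experimental APIs, and the sketch assumes the default `Float8LinearConfig()` corresponds to the tensorwise recipe.

import torch
import torch.nn as nn

from torchao.float8.config import Float8LinearRecipeName
from torchao.float8.float8_linear_utils import (
    _auto_filter_for_recipe,
    convert_to_float8_training,
)

# Illustrative model: the first linear passes the tensorwise thresholds,
# while the second (N=512, K=2048) is small enough to be filtered out.
model = nn.Sequential(
    nn.Linear(8192, 8192),
    nn.Linear(2048, 512),
).to(torch.bfloat16)

# Build a filter for the tensorwise recipe, additionally skipping any module
# whose FQN contains "output" (none here; shown for illustration).
module_filter_fn = _auto_filter_for_recipe(
    Float8LinearRecipeName.TENSORWISE, filter_fqns=["output"]
)

# Assumes the default Float8LinearConfig() matches the tensorwise recipe.
convert_to_float8_training(model, module_filter_fn=module_filter_fn)
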
