Shortcuts

Source code for torchtune.data._chat_formats

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from abc import ABC, abstractmethod
from typing import List

from torchtune.data._types import Message


class ChatFormat(ABC):
    """
    Abstract interface for chat formats.

    A concrete chat format supplies one template string per conversation
    role — ``system``, ``user``, and ``assistant`` — each containing
    placeholders that the data inputs are substituted into, and implements
    :meth:`format` to apply those templates to a conversation.
    """

    # Per-role template strings; subclasses override these with format
    # strings containing ``{content}``-style placeholders.
    system = ""
    user = ""
    assistant = ""

    @classmethod
    @abstractmethod
    def format(
        cls,
        sample: List[Message],
    ) -> List[Message]:
        """
        Format each role's message(s) according to the chat format.

        Args:
            sample (List[Message]): a single conversation, structured as a list
                of `Message` objects

        Returns:
            The formatted list of messages
        """
        ...
class Llama2ChatFormat(ChatFormat):
    """
    Chat format that formats human and system prompts with appropriate tags
    used in LLaMA2 pre-training. Taken from Meta's official `LLaMA inference
    repository <https://github.com/meta-llama/llama/blob/main/llama/generation.py>`_.

    Example:
        .. code-block:: text

            "[INST] <<SYS>>
            You are a helpful, respectful and honest assistant.
            <</SYS>>"

            I am going to Paris, what should I see? [/INST] Paris, the capital of France, is known for its stunning architecture..."

    """

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    system = f"{B_SYS}{{content}}{E_SYS}"
    user = f"{B_INST} {{system_message}}{{content}} {E_INST} "
    assistant = ""

    @classmethod
    def format(
        cls,
        sample: List[Message],
    ) -> List[Message]:
        """
        Format user and system messages with appropriate tags.

        Args:
            sample (List[Message]): a single conversation, structured as a list
                of `Message` objects

        Returns:
            The formatted list of messages
        """
        system_message = ""
        formatted_dialogue = []
        for message in sample:
            content = ""
            if message.role == "system":
                content = cls.system.format(content=message.content)
                system_message = content
                # Incorporate the system message in the user message - LLaMA2 only
                # looks for the <<SYS>> tags and not the explicit role so this will
                # be treated the same as an actual system message. We do this because
                # of the nesting of the system prompt in the user message.
                continue
            elif message.role == "user":
                content = cls.user.format(
                    system_message=system_message, content=message.content
                )
                # Fix: only the first user turn after a system message should
                # carry the <<SYS>> block. Meta's reference implementation nests
                # the system prompt in the first [INST] only; without this reset
                # every subsequent user turn would repeat the system prompt.
                system_message = ""
            elif message.role == "assistant":
                # No special formatting needed for assistant message
                content = message.content
            # Guard against unknown roles / empty formatting results.
            assert content != ""
            formatted_dialogue.append(
                Message(role=message.role, content=content, masked=message.masked),
            )
        return formatted_dialogue
class MistralChatFormat(ChatFormat):
    """
    Formats according to `Mistral's instruct model <https://docs.mistral.ai/models/>`_.

    It is identical to `Llama2ChatFormat`, except it does not support system
    prompts.

    Example:
        .. code-block:: text

            "[INST] I am going to Paris, what should I see? [/INST] Paris, the capital of France, is known for its stunning architecture..."

    """

    B_INST, E_INST = "[INST]", "[/INST]"
    # Mistral instruct has no system-prompt convention.
    system = None
    user = f"{B_INST} {{content}} {E_INST} "
    assistant = ""

    @classmethod
    def format(
        cls,
        sample: List[Message],
    ) -> List[Message]:
        """
        Format user and system messages with appropriate tags.

        Args:
            sample (List[Message]): a single conversation, structured as a list
                of `Message` objects

        Returns:
            The formatted list of messages

        Raises:
            ValueError: If system prompts are provided
        """
        formatted_dialogue = []
        for msg in sample:
            if msg.role == "system":
                raise ValueError(
                    "System prompts are not supported in MistralChatFormat"
                )
            if msg.role == "user":
                new_content = cls.user.format(
                    content=msg.content,
                )
            elif msg.role == "assistant":
                # Assistant turns pass through without any special tags.
                new_content = msg.content
            else:
                new_content = ""
            # Guard against unknown roles / empty formatting results.
            assert new_content != ""
            formatted_dialogue.append(
                Message(role=msg.role, content=new_content, masked=msg.masked),
            )
        return formatted_dialogue
class ChatMLFormat(ChatFormat):
    """
    OpenAI's `Chat Markup Language
    <https://github.com/MicrosoftDocs/azure-docs/blob/772c14eeabfa0c0c561d5c2d34ef19341f528b7b/articles/ai-services/openai/how-to/chat-markup-language.md>`_
    used by their chat models.

    It is the default chat format used by HuggingFace models.

    Example:
        .. code-block:: text

            <|im_start|>system
            Provide some context and/or instructions to the model.<|im_end|>
            <|im_start|>user
            The user’s message goes here<|im_end|>
            <|im_start|>assistant
            The assistant’s response goes here<|im_end|>

    """

    IM_START, IM_END = "<|im_start|>", "<|im_end|>"
    system = f"{IM_START}system\n{{content}}{IM_END}\n"
    user = f"{IM_START}user\n{{content}}{IM_END}\n"
    assistant = f"{IM_START}assistant\n{{content}}{IM_END}"

    @classmethod
    def format(
        cls,
        sample: List[Message],
    ) -> List[Message]:
        """
        Format user and system messages with appropriate tags.

        Args:
            sample (List[Message]): a single conversation, structured as a list
                of `Message` objects

        Returns:
            The formatted list of messages
        """
        # Every role has its own template, so dispatch through a lookup table.
        templates = {
            "system": cls.system,
            "user": cls.user,
            "assistant": cls.assistant,
        }
        formatted_dialogue = []
        for msg in sample:
            template = templates.get(msg.role, "")
            wrapped = template.format(content=msg.content) if template else ""
            # Guard against unknown roles / empty formatting results.
            assert wrapped != ""
            formatted_dialogue.append(
                Message(role=msg.role, content=wrapped, masked=msg.masked),
            )
        return formatted_dialogue

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources