
Source code for torchtune.models.llama3._tokenizer

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict, List, Optional, Tuple

from torchtune.data import Message, truncate
from torchtune.modules.tokenizers import ModelTokenizer, TikTokenBaseTokenizer


CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""  # noqa
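
The pattern above is the cl100k-style pre-tokenization regex: before any BPE merges are applied, raw text is split into chunks (contractions, letter runs, 1-3 digit number groups, punctuation, and whitespace), and each chunk is encoded independently. A minimal sketch of that splitting, assuming the third-party `regex` package (the stdlib `re` module does not support `\p{...}` character classes):

# Hypothetical illustration, not part of the module source.
import regex

chunks = regex.findall(CL100K_PATTERN, "Hello world, it's 2024!")
# -> ['Hello', ' world', ',', ' it', "'s", ' ', '202', '4', '!']
# Note how numbers are capped at 3-digit groups and the contraction "'s"
# is split off; each chunk is then BPE-encoded by the tiktoken model.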

PAD_ID = 0

LLAMA3_SPECIAL_TOKENS = {
    "<|begin_of_text|>": 128000,
    "<|end_of_text|>": 128001,
    "<|start_header_id|>": 128006,
    "<|end_header_id|>": 128007,
    "<|eot_id|>": 128009,
    "<|eom_id|>": 128008,
    "<|python_tag|>": 128255,
}
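
These IDs sit in the special-token range that Llama3 appends after its 128,000 base BPE tokens. A caller may pass their own mapping to the tokenizer, but since `_validate_special_tokens` (defined below) requires every canonical token to be present, a custom mapping should extend the defaults rather than replace them. A hedged sketch, where the extra token name and its ID are hypothetical:

# Hypothetical illustration, not part of the module source.
custom_special_tokens = {
    **LLAMA3_SPECIAL_TOKENS,
    "<|custom_tool_call|>": 128010,  # hypothetical choice of a reserved ID
}
tokenizer = Llama3Tokenizer("/path/to/tt_model", special_tokens=custom_special_tokens)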


class Llama3Tokenizer(ModelTokenizer):
    """
    tiktoken tokenizer configured with Llama3 Instruct's special tokens, as described in
    https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3

    Args:
        path (str): Path to pretrained tiktoken tokenizer file.
        special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Llama3 special tokens.

    Examples:
        >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    """

    def __init__(
        self,
        path: str,
        special_tokens: Optional[Dict[str, int]] = None,
    ):
        self.special_tokens = (
            special_tokens if special_tokens is not None else LLAMA3_SPECIAL_TOKENS
        )

        self._validate_special_tokens()

        # Encode BOS and EOS, define pad ID
        self.bos_id = self.special_tokens["<|begin_of_text|>"]
        self.eos_id = self.special_tokens["<|end_of_text|>"]
        self.pad_id = PAD_ID

        # Encode extra special tokens
        self.start_header_id = self.special_tokens["<|start_header_id|>"]
        self.end_header_id = self.special_tokens["<|end_header_id|>"]
        self.eot_id = self.special_tokens["<|eot_id|>"]
        self.eom_id = self.special_tokens["<|eom_id|>"]
        self.python_tag = self.special_tokens["<|python_tag|>"]

        # During generation, stop when either eos_id or eot_id is encountered
        self.stop_tokens = [self.eos_id, self.eot_id]

        self.tt_model = TikTokenBaseTokenizer(
            path=path,
            name="llama3_tiktoken",
            pattern=CL100K_PATTERN,
            bos_id=self.bos_id,
            eos_id=self.eos_id,
            special_tokens=self.special_tokens,
        )

    def _validate_special_tokens(
        self,
    ):
        """
        Validate that required special tokens are passed into the tokenizer. The
        following special tokens are required: <|begin_of_text|>, <|end_of_text|>,
        <|start_header_id|>, <|end_header_id|>, <|eot_id|>, <|eom_id|>, <|python_tag|>
        """
        for token in LLAMA3_SPECIAL_TOKENS.keys():
            if token not in self.special_tokens:
                raise ValueError(f"{token} missing from special_tokens")

    @property
    def base_vocab_size(self) -> int:
        return self.tt_model.base_vocab_size

    @property
    def vocab_size(self) -> int:
        return self.tt_model.vocab_size

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        add_eos: bool = True,
    ) -> List[int]:
        return self.tt_model.encode(text=text, add_bos=add_bos, add_eos=add_eos)
    def decode(
        self,
        token_ids: List[int],
        truncate_at_eos: bool = True,
    ) -> str:
        """
        Decode a list of token ids into a string.

        Args:
            token_ids (List[int]): The list of token ids.
            truncate_at_eos (bool): Whether to truncate the string at the end of
                sequence token. Default is True.

        Returns:
            str: The decoded string.
        """
        return self.tt_model.decode(token_ids, truncate_at_eos=truncate_at_eos)
    def tokenize_message(
        self, message: Message, tokenize_header: bool = False
    ) -> List[int]:
        """
        Tokenize a message into a list of token ids.

        Args:
            message (Message): The message to tokenize.
            tokenize_header (bool): Whether to prepend a tokenized header to the message.

        Returns:
            List[int]: The list of token ids.
        """
        if tokenize_header:
            tokenized_header = (
                [self.start_header_id]
                + self.encode(message.role.strip(), add_bos=False, add_eos=False)
                + [self.end_header_id]
                + self.encode("\n\n", add_bos=False, add_eos=False)
            )
        else:
            tokenized_header = []
        tokenized_body = self.encode(
            message.content.strip(), add_bos=False, add_eos=False
        )
        if message.ipython:
            tokenized_body = [self.python_tag] + tokenized_body
        tokenized_message = tokenized_header + tokenized_body
        if message.eot:
            tokenized_message = tokenized_message + [self.eot_id]
        else:
            tokenized_message = tokenized_message + [self.eom_id]
        return tokenized_message
    def tokenize_messages(
        self,
        messages: List[Message],
        max_seq_len: Optional[int] = None,
        tokenize_header: bool = True,
        add_eos: bool = True,
    ) -> Tuple[List[int], List[bool]]:
        """
        Tokenize a list of messages into a list of token ids and masks.

        Args:
            messages (List[Message]): The list of messages to tokenize.
            max_seq_len (Optional[int]): The maximum sequence length.
            tokenize_header (bool): Whether to prepend a tokenized header to each message.
            add_eos (bool): Whether to append the end-of-sequence token after the final message.

        Returns:
            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
        """
        tokens = [self.bos_id]
        # bos and eos are always masked
        mask = [True]
        for message in messages:
            tokenized_message = self.tokenize_message(
                message, tokenize_header=tokenize_header
            )
            tokens = tokens + tokenized_message
            mask = mask + ([message.masked] * len(tokenized_message))
            if max_seq_len and len(tokens) >= max_seq_len:
                break
        if add_eos:
            tokens = tokens + [self.eos_id]
            mask = mask + [True]
        if max_seq_len:
            tokens = truncate(tokens, max_seq_len, self.eos_id)
            mask = truncate(mask, max_seq_len, True)
        return tokens, mask
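
Taken together, a typical round trip through the tokenizer looks like the sketch below. The tokenizer file path and the `Message` constructor arguments are assumptions based on the signatures referenced above, not output from a real run:

# Hypothetical illustration, not part of the module source.
from torchtune.data import Message

tokenizer = Llama3Tokenizer("/path/to/tt_model")

messages = [
    Message(role="user", content="What is 2 + 2?", masked=True),
    Message(role="assistant", content="2 + 2 is 4."),
]
tokens, mask = tokenizer.tokenize_messages(messages, max_seq_len=512)
# tokens: [bos_id, <user header + body + eot_id>, <assistant header + body + eot_id>, eos_id]
# mask:   True for bos/eos and the entire masked user turn, False for the assistant turn,
#         so a loss can be computed only on assistant tokens.

# Decoding truncates at the first end-of-sequence token by default.
text = tokenizer.decode(tokens, truncate_at_eos=True)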
