Shortcuts

Source code for torchtune.modules.transforms.tokenizers._hf_tokenizer

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
from typing import Any, Dict, List, Optional

from tokenizers import Tokenizer
from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer


class HuggingFaceBaseTokenizer(BaseTokenizer):
    """
    A wrapper around Hugging Face tokenizers. See https://github.com/huggingface/tokenizers

    This can be used to load from a Hugging Face tokenizer.json file into a torchtune BaseTokenizer.

    This class will load the tokenizer.json file from tokenizer_json_path. It will
    attempt to infer BOS and EOS token IDs from config.json if possible, and if not
    will fallback to inferring them from generation_config.json.

    Args:
        tokenizer_json_path (str): Path to tokenizer.json file
        tokenizer_config_json_path (Optional[str]): Path to tokenizer_config.json file. Default: None
        generation_config_path (Optional[str]): Path to generation_config.json file. Default: None

    Raises:
        ValueError: If neither tokenizer_config_json_path or generation_config_path are specified.
    """

    def __init__(
        self,
        tokenizer_json_path: str,
        *,
        tokenizer_config_json_path: Optional[str] = None,
        generation_config_path: Optional[str] = None,
    ):
        self.tokenizer = Tokenizer.from_file(tokenizer_json_path)
        if not (tokenizer_config_json_path or generation_config_path):
            raise ValueError(
                "At least one of tokenizer_config_json_path or generation_config_path must be specified."
            )
        if tokenizer_config_json_path:
            with open(tokenizer_config_json_path, "rb") as f:
                self.config = json.load(f)
        else:
            self.config = None
        if generation_config_path:
            with open(generation_config_path, "rb") as f:
                self.generation_config = json.load(f)
        else:
            self.generation_config = None
        # Order matters: BOS/EOS IDs must be known before probing whether the
        # underlying tokenizer adds them automatically.
        self._infer_bos_eos_tokens()
        self._infer_should_add_bos_eos()

    def _get_token_from_config(
        self, config: Dict[str, Any], key: str
    ) -> Optional[str]:
        """
        HF BOS/EOS tokens are either stored as e.g. {'bos_token': 5} or
        {'bos_token': {'content': 5, ...}}. This utility handles both.

        Args:
            config (Dict[str, Any]): Parsed tokenizer_config.json contents.
            key (str): Key to look up (e.g. "bos_token" or "eos_token").

        Returns:
            Optional[str]: The token string, or None when the key is absent
                so the caller can fall back to generation_config. (Previously
                an absent key raised ValueError, which made the documented
                generation_config fallback unreachable.)

        Raises:
            ValueError: If the key is present but its value cannot be parsed
                as a token.
        """
        token = config.get(key)
        if token is None:
            # Key not present at all: defer to generation_config fallback.
            return None
        if isinstance(token, dict):
            if "content" not in token:
                raise ValueError(f"Could not parse {key} from config")
            token = token["content"]
        elif not isinstance(token, str):
            raise ValueError(f"Could not parse {key} from config")
        return token

    def _infer_bos_eos_tokens(self):
        """
        Infer BOS and EOS token IDs from config and/or generation_config.

        Will first try to infer token from config then map to ID. If that's
        not available, will infer ID directly from generation_config.
        Otherwise, raise a ValueError.
        """
        self.bos_id = None
        self.eos_id = None

        if self.config:
            bos_token = self._get_token_from_config(self.config, "bos_token")
            eos_token = self._get_token_from_config(self.config, "eos_token")
            if bos_token is not None:
                self.bos_id = self.tokenizer.token_to_id(bos_token)
            if eos_token is not None:
                self.eos_id = self.tokenizer.token_to_id(eos_token)

        if self.generation_config:
            # Only fill in IDs that config didn't provide.
            if self.bos_id is None:
                self.bos_id = self.generation_config.get("bos_token_id")
            if self.eos_id is None:
                self.eos_id = self.generation_config.get("eos_token_id")

        if self.bos_id is None or self.eos_id is None:
            raise ValueError("Could not infer BOS and EOS token IDs from config")

    def _infer_should_add_bos_eos(self):
        """
        Hugging Face tokenizers sometimes add BOS by default. We should infer this to determine
        whether to add it ourselves in encode. Otherwise we will get duplicate BOS tokens.
        """
        self.hf_adds_bos, self.hf_adds_eos = False, False
        # Encoding "" yields only whatever special tokens the tokenizer
        # inserts on its own; probe it once so encode() avoids duplicates.
        encoded_empty_str = self.tokenizer.encode("").ids
        if self.bos_id in encoded_empty_str:
            self.hf_adds_bos = True
        if self.eos_id in encoded_empty_str:
            self.hf_adds_eos = True

    def encode(
        self, text: str, add_bos: bool = True, add_eos: bool = True
    ) -> List[int]:
        """
        Encodes a string into a list of token ids.

        Args:
            text (str): The text to encode.
            add_bos (bool): Whether to add the tokenizer's bos_id to the encoded string.
                Default True.
            add_eos (bool): Whether to add the tokenizer's eos_id to the encoded string.
                Default True.

        Returns:
            List[int]: The list of token ids.
        """
        token_ids = self.tokenizer.encode(text).ids
        # Only add BOS/EOS ourselves if the HF tokenizer doesn't already.
        if add_bos and not self.hf_adds_bos:
            token_ids.insert(0, self.bos_id)
        if add_eos and not self.hf_adds_eos:
            token_ids.append(self.eos_id)
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode a list of token ids into a string.

        Args:
            token_ids (List[int]): The list of token ids.

        Returns:
            str: The decoded string.
        """
        return self.tokenizer.decode(token_ids)

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources