yichael
/
AutoAndroidController


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243
							from pathlib import Path
from typing import Any

from transformers.convert_slow_tokenizer import TikTokenConverter
from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE


def convert_tiktoken_to_fast(encoding: Any, output_dir: str) -> None:
    """
    Converts given `tiktoken` encoding to `PretrainedTokenizerFast` and saves the configuration of converted tokenizer
    on disk.

    Two files are written: the dumped BPE ranks at `<output_dir>/tiktoken/<TIKTOKEN_VOCAB_FILE>` and the converted
    tokenizer at `<output_dir>/<TOKENIZER_FILE>`. Missing directories (including parents of `output_dir`) are created.

    Args:
        encoding (`str` or `tiktoken.Encoding`):
            Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
            `tiktoken.get_encoding(encoding)`.
        output_dir (`str`):
            Save path for converted tokenizer configuration file.

    Raises:
        ValueError: If an `ImportError` occurs while importing `tiktoken` or while dumping the BPE ranks. The
            original `ImportError` is attached as `__cause__`.
    """
    output_dir = Path(output_dir)

    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
    tokenizer_file = output_dir / TOKENIZER_FILE

    # Create `<output_dir>/tiktoken` (and therefore `output_dir` and any missing parents) up front, so writing the
    # vocab file does not depend on the writer backend creating directories implicitly.
    save_file.parent.mkdir(parents=True, exist_ok=True)

    save_file_absolute = str(save_file.absolute())
    output_file_absolute = str(tokenizer_file.absolute())

    try:
        from tiktoken import get_encoding
        from tiktoken.load import dump_tiktoken_bpe

        if isinstance(encoding, str):
            encoding = get_encoding(encoding)

        # The dump reads the encoding's private `_mergeable_ranks` attribute and writes it to the vocab file.
        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
    except ImportError as err:
        # Keep the ValueError type/message callers may rely on, but chain the original error so the actual missing
        # module is visible in the traceback.
        raise ValueError(
            "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`."
        ) from err

    # Build the fast tokenizer from the dumped vocab file plus the encoding's split pattern and special tokens.
    tokenizer = TikTokenConverter(
        vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens
    ).converted()
    tokenizer.save(output_file_absolute)