| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135 |
- # Copyright 2020 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
- tokenization_utils_fast.py
- """
- import bisect
- import itertools
- import re
- import unicodedata
- from collections import OrderedDict
- from typing import Any, Optional, Union, overload
- from .tokenization_utils_base import (
- ENCODE_KWARGS_DOCSTRING,
- ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
- INIT_TOKENIZER_DOCSTRING,
- AddedToken,
- BatchEncoding,
- EncodedInput,
- EncodedInputPair,
- PreTokenizedInput,
- PreTokenizedInputPair,
- PreTrainedTokenizerBase,
- TextInput,
- TextInputPair,
- TruncationStrategy,
- )
- from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
- logger = logging.get_logger(__name__)  # module-level logger shared by the Trie and the tokenizer below
- # Slow tokenizers are saved as a vocabulary plus three separate files (file names below)
- SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
- ADDED_TOKENS_FILE = "added_tokens.json"
- TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
class Trie:
    """
    Character-level trie used to split a text on `added_tokens` in one pass.

    Every node is a plain `dict` mapping a single character to its child node. The special key stored in
    `self._termination_char` (the empty string) marks that the path leading to a node spells a complete token.

    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self, *args):
        # Nested dict-of-dicts holding the trie itself.
        self.data = {}
        # Flat set of every word added so far (lets callers test membership cheaply).
        self._tokens = set()
        # Key marking "a word ends here" inside a node.
        self._termination_char = ""
        self.update(*args)

    def update(self, *args):
        """
        Adds new tokens to the Trie.

        Two calling conventions are accepted:

        - `update(iterable_of_words)`: a single non-string iterable (list, set, dict keys, ...) of words.
        - `update("word1", "word2", ...)`: any number of words passed directly. A single string argument is
          treated as ONE word, not as an iterable of characters.

        Args:
            *args: Either one iterable of words, or a variable number of words.
        """
        if len(args) == 1 and not isinstance(args[0], str):
            words = args[0]
        else:
            words = args
        # Snapshot into a tuple so the source iterable is consumed before the trie is touched.
        for token in tuple(words):
            self.add(token)

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) of `word` and adds it to the internal `data` trie representation.
        The special key `""` in `self._termination_char` is used to represent termination.

        This function is idempotent: adding the same word twice leaves the trie unchanged. Empty words are ignored.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty string
            return

        self._tokens.add(word)
        node = self.data
        for char in word:
            # `setdefault` both creates the child when missing and returns it.
            node = node.setdefault(char, {})
        node[self._termination_char] = 1

    def split(self, text: str) -> list[str]:
        """
        Looks for the words added to the trie within `text`. The output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first!

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        terminator = self._termination_char

        # Indexes are counted left of the chars index: in "hello", index 0 is left of "h", index 1 is between
        # "h" and "e", index 5 is right of the "o".
        #
        # `states` maps every possible match start to the trie node reached so far (a partial match). Several
        # partial matches must be tracked at once: if the trie contains "blowing" and "lower" and we read
        # "blower", the result must be ["b", "lower"].
        states = OrderedDict()

        # Every index where the text must be cut. The cut at 0 is forced here, the one at len(text) is added
        # by `cut_text`.
        offsets = [0]

        # Set by the lookahead when a full match extends past `current`; the main loop skips up to it.
        skip = 0
        # Main loop over the characters of `text`
        for current, current_char in enumerate(text):
            if skip and current < skip:
                # Prevents the lookahead from matching twice, like extra_id_100 and id_100
                continue

            # Partial matches that stop matching on this character; they are dropped after the inner loop
            # because `states` cannot change size while it is being iterated.
            to_remove = set()
            # Whenever a match is found everything is dropped: this is a greedy algorithm.
            reset = False

            # Advance (or finish) the partial matches we already have.
            for start, trie_pointer in states.items():
                if terminator in trie_pointer:
                    # This is a final match, we need to reset and store the results in `offsets`.

                    # Lookahead to match longest first (important for extra_id_1 vs extra_id_100).
                    # Earlier partial matches are also inspected: with "[CLS]" and "L" both in the trie,
                    # "[CLS]" must win.
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match is later, we can stop looking
                            break
                        elif lookstart < start:
                            # This partial match is earlier, its trie pointer was already updated for the
                            # current character, so its index is + 1
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Here lookstart == start and looktrie_pointer == trie_pointer.
                            # It wasn't updated yet so indices are the current ones
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if terminator in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if terminator in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]
                        # End lookahead

                    # Storing and resetting
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The current character extends this partial match: store the deeper node back.
                    # (Re-assigning the value of an existing key is safe during iteration.)
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # The current character does not continue this partial match: stop tracking it.
                    to_remove.add(start)

            # Either clear everything (a real match was found) or only the partial matches that failed.
            if reset:
                states = OrderedDict()
            else:
                for start in to_remove:
                    del states[start]

            # If this character starts a word of the trie, begin tracking a new partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # A match may end exactly at the end of the text.
        for start, trie_pointer in states.items():
            if terminator in trie_pointer:
                # This is a final match, store the result in `offsets`.
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # The longest cut is always the one with the lowest start, i.e. the first item, so stop here.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        """
        Slices `text` at the given cut positions.

        Args:
            text (`str`): The text to cut.
            offsets (`list[int]`): Cut positions. A final cut at `len(text)` is always added; the list passed
                in is not modified.

        Returns:
            `list[str]`: The non-empty pieces of `text` between consecutive cut positions.
        """
        tokens = []
        start = 0
        # The trailing `len(text)` makes sure the last part of the string is emitted.
        for end in (*offsets, len(text)):
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                continue
            elif start == end:
                # This might happen if there's a match at index 0; it also prevents zero-width cuts in case
                # of two consecutive matches
                continue
            tokens.append(text[start:end])
            start = end

        return tokens
class ExtensionsTrie(Trie):
    """`Trie` that can additionally list every stored token starting with a given prefix."""

    def __init__(self, *args):
        super().__init__(*args)

    def extensions(self, prefix: str):
        """
        Generates all extensions of a given prefix token in the Trie.

        Returns an empty list when no stored token starts with `prefix`.

        Example:

        ```python
        >>> trie = ExtensionsTrie()
        >>> trie.add("apple")
        >>> trie.add("app")
        >>> trie.add("application")
        >>> trie.extensions("app")
        ['app', 'apple', 'application']
        >>> trie.extensions("apz")
        []
        ```
        """
        prefix_node = self._get_node(prefix)
        ret = self._collect_tokens(prefix_node)
        return [prefix + token for token in ret]

    def _get_node(self, token: str) -> dict:
        """
        Retrieves the node corresponding to the given token in the Trie.

        Args:
            token (str): The token for which the corresponding node needs to be retrieved.

        Returns:
            dict: The node in the Trie reached by following `token`, or an empty dict when `token` is not a
            path of the Trie (so that no extension is reported for it).
        """
        node = self.data
        for char in token:
            if char not in node:
                # Falling back to the last node reached would make `extensions` glue the suffixes of a
                # shorter prefix onto `token`; an empty node yields no extensions instead.
                return {}
            node = node[char]
        return node

    def _collect_tokens(self, node: dict) -> list:
        """
        Generates all tokens in the Trie starting from a given node.

        Args:
            node (dict): The node in the Trie from which tokens need to be generated.

        Returns:
            list: Suffixes spelled by every terminated path below `node`. The empty suffix comes first when
            `node` itself terminates a token.
        """
        tokens = [self._termination_char] if self._termination_char in node else []
        for token, subtrie_head in node.items():
            if token != self._termination_char:
                subtokens = self._collect_tokens(subtrie_head)
                tokens.extend([token + subtoken for subtoken in subtokens])
        return tokens
def _is_whitespace(char):
    """Tell whether `char` counts as whitespace (space, tab, newline, carriage return or Unicode `Zs`)."""
    # Tab, newline and carriage return are control characters for Unicode, but they are treated as
    # whitespace here.
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"
def _is_control(char):
    """Tell whether `char` is a control character (any Unicode `C*` category except tab/newline/CR)."""
    # Tab, newline and carriage return are counted as whitespace, not as control characters.
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char).startswith("C")
def _is_punctuation(char):
    """Tell whether `char` is punctuation (any non-alphanumeric printable ASCII or any Unicode `P*` category)."""
    code_point = ord(char)
    # Every printable ASCII character that is neither a letter nor a digit is treated as punctuation.
    # "^", "$" and "`" are not in the Unicode punctuation class but are included for consistency.
    if 33 <= code_point <= 47 or 58 <= code_point <= 64 or 91 <= code_point <= 96 or 123 <= code_point <= 126:
        return True
    return unicodedata.category(char).startswith("P")
def _is_end_of_word(text):
    """Tell whether the last character of `text` is a control, punctuation or whitespace character."""
    final_char = text[-1]
    # All three predicates are evaluated before combining, as no short-circuit is wanted here.
    verdicts = (_is_control(final_char), _is_punctuation(final_char), _is_whitespace(final_char))
    return any(verdicts)
def _is_start_of_word(text):
    """Tell whether the first character of `text` is a control, punctuation or whitespace character."""
    leading_char = text[0]
    # All three predicates are evaluated before combining, as no short-circuit is wanted here.
    verdicts = (_is_control(leading_char), _is_punctuation(leading_char), _is_whitespace(leading_char))
    return any(verdicts)
def _insert_one_token_to_ordered_list(token_list: list[str], new_token: str):
    """
    Insert `new_token` into `token_list` in place, keeping it sorted and free of duplicates.

    `token_list` must already be sorted. Nothing is returned.
    """
    position = bisect.bisect_left(token_list, new_token)
    # `bisect_left` points at an equal element when there is one.
    already_present = position < len(token_list) and token_list[position] == new_token
    if not already_present:
        token_list.insert(position, new_token)
- @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
- class PreTrainedTokenizer(PreTrainedTokenizerBase):
- """
- Base class for all slow tokenizers.
- Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
- Handles all the shared methods for tokenization and special tokens, as well as methods for downloading/caching/loading
- pretrained tokenizers and for adding tokens to the vocabulary.
- This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
- specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
- """
def __init__(self, **kwargs):
    """
    Set up the added-token bookkeeping, then run the parent initializer.

    Args:
        **kwargs: Forwarded to the parent `__init__`, except `added_tokens_decoder` (a mapping from index to
            `AddedToken`) which is popped and merged into `self._added_tokens_decoder`.
    """
    # 1. The trie used to split texts on added tokens must exist before anything adds a token.
    self.tokens_trie = Trie()

    # 2. A child class may already have created `_added_tokens_decoder`; only create it when missing.
    if not hasattr(self, "_added_tokens_decoder"):
        self._added_tokens_decoder: dict[int, AddedToken] = {}

    # 3. When loading from a saved tokenizer, the saved decoder entries overwrite existing ones.
    saved_decoder = kwargs.pop("added_tokens_decoder", {})
    self._added_tokens_decoder.update(saved_decoder)
    self._added_tokens_encoder: dict[str, int] = {
        added.content: index for index, added in self._added_tokens_decoder.items()
    }

    # 4. Init the parent class.
    super().__init__(**kwargs)

    # 5. Special tokens that are not yet known as added tokens are added at the end, in the order given by
    # `self.all_special_tokens_extended`.
    unregistered_special = [
        special for special in self.all_special_tokens_extended if special not in self._added_tokens_encoder
    ]
    self._add_tokens(unregistered_special, special_tokens=True)

    self._decode_use_source_tokenizer = False
@property
def is_fast(self) -> bool:
    """`bool`: Always `False` for this class."""
    return False
@property
def vocab_size(self) -> int:
    """
    `int`: Size of the base vocabulary (without the added tokens).

    Not implemented here: accessing it raises `NotImplementedError`.
    """
    raise NotImplementedError
@property
def added_tokens_encoder(self) -> dict[str, int]:
    """
    Returns the mapping from added-token string to index, ordered by increasing index.

    The mapping is rebuilt from `self._added_tokens_decoder` on every access; slow tokenizers also keep a
    cached copy in `self._added_tokens_encoder`.
    """
    decoder = self._added_tokens_decoder
    # Keys are unique, so sorting the keys gives the same order as sorting the items by key.
    return {decoder[index].content: index for index in sorted(decoder)}
@property
def added_tokens_decoder(self) -> dict[int, AddedToken]:
    """
    Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

    Returns:
        `dict[int, AddedToken]`: A new dict of the added tokens, ordered by increasing index.
    """
    return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

@added_tokens_decoder.setter
def added_tokens_decoder(self, value: dict[int, Union[AddedToken, str]]) -> None:
    """
    Merges `value` into the added tokens and refreshes the reverse map and the total vocabulary size.

    Plain strings are wrapped in `AddedToken(token)`.

    Raises:
        `TypeError`: If a key is not an `int` or a value is neither a `str` nor an `AddedToken`. Nothing is
        modified in that case.
    """
    # Validate everything first so that a bad entry cannot leave the two maps half-updated (and
    # `total_vocab_size` stale).
    for index, token in value.items():
        if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
            raise TypeError(
                f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}"
            )

    for index, token in value.items():
        self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
        self._added_tokens_encoder[str(token)] = index
    self._update_total_vocab_size()
def get_added_vocab(self) -> dict[str, int]:
    """
    Returns the added tokens in the vocabulary as a dictionary of token to index.

    Results might differ from the fast tokenizer's, because for now tokens are always added even when they
    are already in the vocabulary. This is something that should change.

    Returns:
        `dict[str, int]`: The added tokens. This is the cached `self._added_tokens_encoder` object itself,
        not a copy.
    """
    added_vocab = self._added_tokens_encoder
    return added_vocab
def __len__(self):
    """
    Size of the full vocabulary with the added tokens, as last stored in `self.total_vocab_size`.
    """
    full_size = self.total_vocab_size
    return full_size
def _update_total_vocab_size(self):
    """
    Recompute `self.total_vocab_size` as the number of entries of `self.get_vocab()`.

    The entries (keys) are counted rather than the indices (values), so that a hole in the vocabulary does
    not lead to tokens being added at a wrong index. This operation is slow and is only run when tokens are
    added.
    """
    full_vocab = self.get_vocab()
    self.total_vocab_size = len(full_vocab)
def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_tokens: bool = False) -> int:
    """
    Add a list of new tokens to the tokenizer class. Tokens that are not in the vocabulary yet receive indices
    starting from the length of the current vocabulary. Special tokens are sometimes already in the vocab,
    which is why they are handled specifically.

    Args:
        new_tokens (`list[str]`or `list[tokenizers.AddedToken]`):
            Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary.
            If a token is part of the vocabulary then it is simply recorded as an `AddedToken`, which allows
            to control the stripping and normalization of this token. This is NOT possible in `tokenizers`.
        special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the tokens should be added as special tokens.

    Returns:
        `int`: The number of tokens actually added to the vocabulary.

    Raises:
        `TypeError`: If an element of `new_tokens` is neither a `str` nor an `AddedToken`.

    Examples:

    ```python
    # Let's see how to increase the vocabulary of Bert model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = BertModel.from_pretrained("google-bert/bert-base-uncased")

    num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
    print("We have added", num_added_toks, "tokens")
    # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    model.resize_token_embeddings(len(tokenizer))
    ```"""
    num_added = 0
    if new_tokens is None:
        return num_added

    # TODO this is fairly slow to improve!
    vocab_snapshot = self.get_vocab().copy()
    first_free_index = len(vocab_snapshot)  # computed once: len gives the last index + 1

    for candidate in new_tokens:
        if not isinstance(candidate, (str, AddedToken)):
            raise TypeError(f"Token {candidate} is not a string but a {type(candidate)}.")
        if str(candidate) == "":
            continue

        if isinstance(candidate, str):
            if candidate in self._added_tokens_encoder:
                continue
            # very important for fast and slow equivalence!
            is_special = candidate in self.all_special_tokens or special_tokens
            candidate = AddedToken(
                candidate, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
            )
        elif special_tokens:
            # doing token.special=True changes the normalization! will fix in rust
            # this is important and the only reason why the AddedTokens in each class are normalized by default
            candidate.__setstate__({"special": True, "normalized": candidate.normalized})

        # NOTE(review): this tests the token against the decoder's keys (indices) - confirm this is intended.
        if candidate in self._added_tokens_decoder:
            continue

        if not candidate.special and candidate.normalized and getattr(self, "do_lower_case", False):
            # Normalize if requested
            candidate.content = candidate.content.lower()

        if candidate.content in vocab_snapshot:
            index = vocab_snapshot[candidate.content]
        else:
            index = first_free_index + num_added
            vocab_snapshot[candidate.content] = index
            num_added += 1

        if candidate.special and str(candidate) not in self.all_special_tokens:
            self._special_tokens_map["additional_special_tokens"].append(candidate)

        # Keep the decoder and its reverse map in sync.
        self._added_tokens_decoder[index] = candidate
        self._added_tokens_encoder[candidate.content] = index
        if self.verbose:
            logger.info(f"Adding {candidate} to the vocabulary")

    self._update_trie()
    self._update_total_vocab_size()
    return num_added
def _update_trie(self, unique_no_split_tokens: Optional[list[str]] = None):
    """
    Make sure `self.tokens_trie` contains every added token, plus the given extra tokens.

    Args:
        unique_no_split_tokens (`list[str]`, *optional*):
            Extra token strings to register in the trie after the added tokens.
    """
    trie = self.tokens_trie

    def register(content):
        # Only touch the trie for strings it does not know yet.
        if content not in trie._tokens:
            trie.add(content)

    for added_token in self._added_tokens_decoder.values():
        register(added_token.content)
    for plain_token in unique_no_split_tokens or []:
        register(plain_token)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """
    Returns the number of added tokens when encoding a sequence with special tokens.

    <Tip>

    This builds the inputs for empty dummy sequences and counts the result, and is therefore not efficient.
    Do not put this inside your training loop.

    </Tip>

    Args:
        pair (`bool`, *optional*, defaults to `False`):
            Whether the number of added tokens should be computed in the case of a sequence pair or a single
            sequence.

    Returns:
        `int`: Number of special tokens added to sequences.
    """
    # With empty sequences, everything in the built inputs is a special token.
    second_sequence = [] if pair else None
    with_special_tokens = self.build_inputs_with_special_tokens([], second_sequence)
    return len(with_special_tokens)
def tokenize(self, text: TextInput, **kwargs) -> list[str]:
    """
    Converts a string into a sequence of tokens, using the tokenizer.

    Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
    (BPE/SentencePieces/WordPieces). Takes care of added tokens.

    Args:
        text (`str`):
            The sequence to be encoded.
        **kwargs (additional keyword arguments):
            `split_special_tokens` is consumed here (defaults to `self.split_special_tokens`); the rest is
            passed along to the model-specific `prepare_for_tokenization` preprocessing method.

    Returns:
        `list[str]`: The list of tokens.

    Raises:
        `ValueError`: If a string registered in `self._added_tokens_encoder` has no matching `AddedToken` in
        `self._added_tokens_decoder`.
    """
    split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)

    text, kwargs = self.prepare_for_tokenization(text, **kwargs)

    if kwargs:
        logger.warning(f"Keyword arguments {kwargs} not recognized.")

    if hasattr(self, "do_lower_case") and self.do_lower_case:
        # Lowercase everything except the protected tokens. Might be super slow as well?
        # Empty strings are left out: an empty alternative would match everywhere.
        escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens) if s_tok]
        escaped_special_toks += [
            re.escape(s_tok.content)
            for s_tok in (self._added_tokens_decoder.values())
            if s_tok.content and not s_tok.special and s_tok.normalized
        ]
        if escaped_special_toks:
            # Group 1 captures a protected token (kept as is), group 2 any other character (lowercased).
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
        else:
            # Nothing to protect. Building the pattern anyway would give `()|(.+?)`, whose first group
            # matches the empty string and leaves the second one unset.
            text = text.lower()

    if split_special_tokens:
        no_split_token = []
        tokens = [text]
    else:
        no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
        # "This is something<special_token_1>  else"
        tokens = self.tokens_trie.split(text)

    # ["This is something", "<special_token_1>", "  else"]
    for i, token in enumerate(tokens):
        if token in no_split_token:
            tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
            left = tokens[i - 1] if i > 0 else None
            right = tokens[i + 1] if i < len(tokens) - 1 else None
            if isinstance(tok_extended, AddedToken):
                if tok_extended.rstrip and right:
                    # A bit counter-intuitive but we strip the left of the string
                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
                    tokens[i + 1] = right.lstrip()
                # Strip white spaces on the left
                if tok_extended.lstrip and left:
                    tokens[i - 1] = left.rstrip()  # Opposite here
                # A `single_word` token glued to a neighbouring word is not a match: merge it back into
                # that neighbour and blank its own slot.
                if tok_extended.single_word and left and left[-1] != " ":
                    tokens[i - 1] += token
                    tokens[i] = ""
                elif tok_extended.single_word and right and right[0] != " ":
                    tokens[i + 1] = token + tokens[i + 1]
                    tokens[i] = ""
            else:
                raise ValueError(
                    f"{tok_extended} cannot be tokenized because it was not properly added"
                    f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                )
    # ["This is something", "<special_token_1>", "else"]
    tokenized_text = []
    for token in tokens:
        # Need to skip eventual empty (fully stripped) tokens
        if not token:
            continue
        if token in no_split_token:
            tokenized_text.append(token)
        else:
            tokenized_text.extend(self._tokenize(token))
    # ["This", " is", " something", "<special_token_1>", "else"]
    return tokenized_text
def _tokenize(self, text, **kwargs):
    """
    Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
    vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

    Does NOT take care of added tokens. Not implemented here: calling it raises `NotImplementedError`.
    """
    raise NotImplementedError
def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
    """
    Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
    vocabulary.

    Args:
        tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).

    Returns:
        `int` or `list[int]`: The token id or list of token ids. `None` is returned when `tokens` is `None`.
    """
    if tokens is None:
        return None

    to_id = self._convert_token_to_id_with_added_voc
    if isinstance(tokens, str):
        return to_id(tokens)
    return [to_id(token) for token in tokens]
def _convert_token_to_id_with_added_voc(self, token):
    """
    Convert one token to its id, looking in the added tokens first and falling back to
    `self._convert_token_to_id`. Returns `None` when `token` is `None`.
    """
    if token is None:
        return None

    added_encoder = self._added_tokens_encoder
    return added_encoder[token] if token in added_encoder else self._convert_token_to_id(token)
def _convert_token_to_id(self, token):
    """Convert one token to its id. Not implemented here: calling it raises `NotImplementedError`."""
    raise NotImplementedError
def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding:
    """
    Turn `text` (and optionally `text_pair`) into ids, then hand them to `self.prepare_for_model`.

    Each input may be a string (tokenized, then converted to ids), a non-empty list/tuple of strings
    (each tokenized when `is_split_into_words` is set, otherwise treated as tokens and converted directly)
    or a non-empty list/tuple of integers (used as is). Extra `**kwargs` are forwarded to `self.tokenize`.

    Raises:
        `NotImplementedError`: If `return_offsets_mapping` is set.
        `ValueError`: If an input has none of the shapes above.
    """
    if return_offsets_mapping:
        raise NotImplementedError(
            "return_offset_mapping is not available when using Python tokenizers. "
            "To use this feature, change your tokenizer to one deriving from "
            "transformers.PreTrainedTokenizerFast. "
            "More information on available tokenizers at "
            "https://github.com/huggingface/transformers/pull/2674"
        )

    def get_input_ids(text):
        # Raw string: tokenize, then map to ids.
        if isinstance(text, str):
            return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))

        non_empty_sequence = isinstance(text, (list, tuple)) and len(text) > 0

        # Sequence of strings: either words still to tokenize, or tokens ready to convert.
        if non_empty_sequence and isinstance(text[0], str):
            if not is_split_into_words:
                return self.convert_tokens_to_ids(text)
            word_tokens = [
                piece for word in text for piece in self.tokenize(word, is_split_into_words=True, **kwargs)
            ]
            return self.convert_tokens_to_ids(word_tokens)

        # Sequence of integers: already ids.
        if non_empty_sequence and isinstance(text[0], int):
            return text

        if is_split_into_words:
            raise ValueError(
                f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
                " `is_split_into_words=True`."
            )
        raise ValueError(
            f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
            " integers."
        )

    first_ids = get_input_ids(text)
    second_ids = None if text_pair is None else get_input_ids(text_pair)

    return self.prepare_for_model(
        first_ids,
        pair_ids=second_ids,
        add_special_tokens=add_special_tokens,
        padding=padding_strategy.value,
        truncation=truncation_strategy.value,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_side=padding_side,
        return_tensors=return_tensors,
        prepend_batch_axis=True,
        return_attention_mask=return_attention_mask,
        return_token_type_ids=return_token_type_ids,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        verbose=verbose,
    )
def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        list[TextInput],
        list[TextInputPair],
        list[PreTokenizedInput],
        list[PreTokenizedInputPair],
        list[EncodedInput],
        list[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    split_special_tokens: bool = False,
    **kwargs,
) -> BatchEncoding:
    """
    Convert every entry of a batch (single inputs or pairs) to ids and hand the result to
    `_batch_prepare_for_model`.

    Each input may be a string (tokenized, then converted to ids), a non-empty list/tuple of strings
    (tokenized word by word when `is_split_into_words=True`, otherwise converted to ids as-is), or a
    non-empty list/tuple of integers (returned unchanged). Extra `kwargs` are forwarded to `self.tokenize`.

    Raises:
        NotImplementedError: If `return_offsets_mapping` is set.
        ValueError: If an input is none of the accepted shapes described above.
    """
    if return_offsets_mapping:
        raise NotImplementedError(
            "return_offset_mapping is not available when using Python tokenizers. "
            "To use this feature, change your tokenizer to one deriving from "
            "transformers.PreTrainedTokenizerFast."
        )

    def get_input_ids(text):
        # Plain string: tokenize, then map tokens to ids.
        if isinstance(text, str):
            return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
        if isinstance(text, (list, tuple)) and len(text) > 0:
            head = text[0]
            if isinstance(head, str):
                if not is_split_into_words:
                    # Strings are taken to be tokens already.
                    return self.convert_tokens_to_ids(text)
                # Tokenize each word separately and flatten the pieces.
                pieces = list(
                    itertools.chain.from_iterable(
                        self.tokenize(word, is_split_into_words=True, **kwargs) for word in text
                    )
                )
                return self.convert_tokens_to_ids(pieces)
            if isinstance(head, int):
                # Already ids: nothing to do.
                return text
        raise ValueError(
            "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
        )

    input_ids = []
    for entry in batch_text_or_text_pairs:
        # An entry is a single input when it is not a list/tuple, or when the batch is pre-split
        # into words and the entry's first element is not itself a list/tuple.
        is_single = not isinstance(entry, (list, tuple)) or (
            is_split_into_words and not isinstance(entry[0], (list, tuple))
        )
        ids, pair_ids = (entry, None) if is_single else entry

        first_ids = get_input_ids(ids)
        second_ids = None if pair_ids is None else get_input_ids(pair_ids)
        input_ids.append((first_ids, second_ids))

    prepared = self._batch_prepare_for_model(
        input_ids,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_side=padding_side,
        return_attention_mask=return_attention_mask,
        return_token_type_ids=return_token_type_ids,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        return_tensors=return_tensors,
        verbose=verbose,
        split_special_tokens=split_special_tokens,
    )
    return BatchEncoding(prepared)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
    self,
    batch_ids_pairs: list[Union[PreTokenizedInputPair, tuple[list[int], None]]],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[str] = None,
    return_tensors: Optional[str] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    split_special_tokens: bool = False,
) -> BatchEncoding:
    """
    Runs `prepare_for_model` on every `(first_ids, second_ids)` entry of the batch without padding, gathers
    the per-example outputs into one list per key, then pads the whole batch with `self.pad` and wraps the
    result in a `BatchEncoding` converted to `return_tensors`.

    Args:
        batch_ids_pairs: list of tokenized input ids or input ids pairs
    """
    collected = {}
    for first_ids, second_ids in batch_ids_pairs:
        encoded = self.prepare_for_model(
            first_ids,
            second_ids,
            add_special_tokens=add_special_tokens,
            padding=PaddingStrategy.DO_NOT_PAD.value,  # padding happens once, on the full batch
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=None,  # padding happens once, on the full batch
            padding_side=None,  # padding happens once, on the full batch
            return_attention_mask=False,  # padding happens once, on the full batch
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=None,  # tensor conversion is done on the whole batch at the end
            prepend_batch_axis=False,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
        )
        # Group each output field across examples: key -> list of per-example values.
        for name, item in encoded.items():
            collected.setdefault(name, []).append(item)

    padded = self.pad(
        collected,
        padding=padding_strategy.value,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_side=padding_side,
        return_attention_mask=return_attention_mask,
    )
    return BatchEncoding(padded, tensor_type=return_tensors)
def prepare_for_tokenization(
    self, text: str, is_split_into_words: bool = False, **kwargs
) -> tuple[str, dict[str, Any]]:
    """
    Hook applied to the text before tokenization.

    This base implementation changes nothing: it returns `text` as received together with every keyword
    argument it was given. Overrides are expected to pop the arguments they consume from `kwargs` and return
    the remainder, since leftover `kwargs` are checked at the end of the encoding process.

    Args:
        text (`str`):
            The text to prepare.
        is_split_into_words (`bool`, *optional*, defaults to `False`):
            Whether or not the input is already pre-tokenized (e.g., split into words). Unused by this base
            implementation.
        kwargs (`dict[str, Any]`, *optional*):
            Keyword arguments to use for the tokenization.

    Returns:
        `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
    """
    unused_kwargs = kwargs
    return text, unused_kwargs
def get_special_tokens_mask(
    self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
) -> list[int]:
    """
    Builds the special-tokens mask for one sequence or a pair of sequences.

    When the ids carry no special tokens yet, the mask is all zeros and as long as both sequences combined.
    When they already contain special tokens, the computation is delegated to the parent class.

    Args:
        token_ids_0 (`list[int]`):
            List of ids of the first sequence.
        token_ids_1 (`list[int]`, *optional*):
            List of ids of the second sequence.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

    Raises:
        ValueError: If `already_has_special_tokens` is `True` and a second sequence is supplied.
    """
    if not already_has_special_tokens:
        # No special tokens present: every position is a regular sequence token.
        pair_length = len(token_ids_1) if token_ids_1 else 0
        return [0] * (pair_length + len(token_ids_0))

    if token_ids_1 is not None:
        raise ValueError(
            "You should not supply a second sequence if the provided sequence of "
            "ids is already formatted with special tokens for the model."
        )

    return super().get_special_tokens_mask(
        token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
    )
@overload
def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
@overload
def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...
def convert_ids_to_tokens(
    self, ids: Union[int, list[int]], skip_special_tokens: bool = False
) -> Union[str, list[str]]:
    """
    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
    added tokens.

    Ids found in `self._added_tokens_decoder` are resolved to that entry's `content`; every other id goes
    through `self._convert_id_to_token`.

    Args:
        ids (`int` or `list[int]`):
            The token id (or token ids) to convert to tokens.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding. Only applied when `ids` is a sequence; a
            single `int` id is always converted.

    Returns:
        `str` or `list[str]`: The decoded token(s).
    """
    added_tokens = self._added_tokens_decoder

    if isinstance(ids, int):
        if ids in added_tokens:
            return added_tokens[ids].content
        return self._convert_id_to_token(ids)

    # `self.all_special_ids` does not change while we iterate, so read it once and use a set for
    # constant-time membership tests instead of re-reading and scanning it for every id.
    special_ids = frozenset(self.all_special_ids) if skip_special_tokens else frozenset()

    tokens = []
    for index in ids:
        index = int(index)  # accept int-like entries
        if index in special_ids:
            continue
        if index in added_tokens:
            tokens.append(added_tokens[index].content)
        else:
            tokens.append(self._convert_id_to_token(index))
    return tokens
- def _convert_id_to_token(self, index: int) -> str:
- raise NotImplementedError
def convert_tokens_to_string(self, tokens: list[str]) -> str:
    """
    Joins a sequence of tokens into one string, separating consecutive tokens with a single space.

    Args:
        tokens (`list[str]`): The tokens to join.

    Returns:
        `str`: The joined tokens.
    """
    separator = " "
    return separator.join(tokens)
- def _decode(
- self,
- token_ids: Union[int, list[int]],
- skip_special_tokens: bool = False,
- clean_up_tokenization_spaces: Optional[bool] = None,
- spaces_between_special_tokens: bool = True,
- **kwargs,
- ) -> str:
- self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
- filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
- # If given is a single id, prevents splitting the string in upcoming loop
- if isinstance(filtered_tokens, str):
- filtered_tokens = [filtered_tokens]
- legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
- token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
- }
- # To avoid mixing byte-level and unicode for byte-level BPT
- # we need to build string separately for added tokens and byte-level tokens
- # cf. https://github.com/huggingface/transformers/issues/1133
- sub_texts = []
- current_sub_text = []
- # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
- for token in filtered_tokens:
- if skip_special_tokens and token in self.all_special_tokens:
- continue
- if token in legacy_added_tokens:
- if current_sub_text:
- string = self.convert_tokens_to_string(current_sub_text)
- if len(string) > 0:
- sub_texts.append(string)
- current_sub_text = []
- sub_texts.append(token)
- else:
- current_sub_text.append(token)
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
- if spaces_between_special_tokens:
- text = " ".join(sub_texts)
- else:
- text = "".join(sub_texts)
- clean_up_tokenization_spaces = (
- clean_up_tokenization_spaces
- if clean_up_tokenization_spaces is not None
- else self.clean_up_tokenization_spaces
- )
- if clean_up_tokenization_spaces:
- clean_text = self.clean_up_tokenization(text)
- return clean_text
- else:
- return text
|