| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800 |
- # Generated content DO NOT EDIT
class AddedToken:
    """
    Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
    It can have special options that define the way it should behave.

    Args:
        content (:obj:`str`): The content of the token

        single_word (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should only match single words. If :obj:`True`, this
            token will never match inside of a word. For example the token ``ing`` would match
            on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
            The notion of "`inside of a word`" is defined by the word boundaries pattern in
            regular expressions (ie. the token should start and end with word boundaries).

        lstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its left side.
            If :obj:`True`, this token will greedily match any whitespace on its left. For
            example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
            ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

        rstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its right
            side. If :obj:`True`, this token will greedily match any whitespace on its right.
            It works just like :obj:`lstrip` but on the right.

        normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
            Defines whether this token should match against the normalized version of the input
            text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
            lowercasing the text, the token could be extracted from the input ``"I saw a lion
            Yesterday"``.

        special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
            Defines whether this token should be skipped when decoding.
    """

    def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
        pass

    def __getstate__(self):
        """
        Helper for pickle.
        """
        pass

    def __setstate__(self, state):
        """
        Helper for pickle.
        """
        pass

    @property
    def content(self):
        """
        Get the content of this :obj:`AddedToken`
        """
        pass

    @content.setter
    def content(self, value):
        """
        Set the content of this :obj:`AddedToken`
        """
        pass

    @property
    def lstrip(self):
        """
        Get the value of the :obj:`lstrip` option
        """
        pass

    @lstrip.setter
    def lstrip(self, value):
        """
        Set the value of the :obj:`lstrip` option
        """
        pass

    @property
    def normalized(self):
        """
        Get the value of the :obj:`normalized` option
        """
        pass

    @normalized.setter
    def normalized(self, value):
        """
        Set the value of the :obj:`normalized` option
        """
        pass

    @property
    def rstrip(self):
        """
        Get the value of the :obj:`rstrip` option
        """
        pass

    @rstrip.setter
    def rstrip(self, value):
        """
        Set the value of the :obj:`rstrip` option
        """
        pass

    @property
    def single_word(self):
        """
        Get the value of the :obj:`single_word` option
        """
        pass

    @single_word.setter
    def single_word(self, value):
        """
        Set the value of the :obj:`single_word` option
        """
        pass

    @property
    def special(self):
        """
        Get the value of the :obj:`special` option
        """
        pass

    @special.setter
    def special(self, value):
        """
        Set the value of the :obj:`special` option
        """
        pass
class Encoding:
    """
    The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
    """

    def __init__(self):
        pass

    def __getstate__(self):
        """
        Helper for pickle.
        """
        pass

    def __setstate__(self, state):
        """
        Helper for pickle.
        """
        pass

    @property
    def attention_mask(self):
        """
        The attention mask

        This indicates to the LM which tokens should be attended to, and which should not.
        This is especially important when batching sequences, where we need to apply
        padding.

        Returns:
            :obj:`List[int]`: The attention mask
        """
        pass

    @attention_mask.setter
    def attention_mask(self, value):
        """
        Set the attention mask

        Args:
            value (:obj:`List[int]`): The new attention mask
        """
        pass

    def char_to_token(self, char_pos, sequence_index=0):
        """
        Get the token that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the token that contains this char in the encoded sequence
        """
        pass

    def char_to_word(self, char_pos, sequence_index=0):
        """
        Get the word that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the word that contains this char in the input sequence
        """
        pass

    @property
    def ids(self):
        """
        The generated IDs

        The IDs are the main input to a Language Model. They are the token indices,
        the numerical representations that a LM understands.

        Returns:
            :obj:`List[int]`: The list of IDs
        """
        pass

    @ids.setter
    def ids(self, value):
        """
        Set the generated IDs

        Args:
            value (:obj:`List[int]`): The new list of IDs
        """
        pass

    @staticmethod
    def merge(encodings, growing_offsets=True):
        """
        Merge the list of encodings into one final :class:`~tokenizers.Encoding`

        Args:
            encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
                The list of encodings that should be merged in one
            growing_offsets (:obj:`bool`, defaults to :obj:`True`):
                Whether the offsets should accumulate while merging

        Returns:
            :class:`~tokenizers.Encoding`: The resulting Encoding
        """
        pass

    @property
    def n_sequences(self):
        """
        The number of sequences represented

        Returns:
            :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
        """
        pass

    @n_sequences.setter
    def n_sequences(self, value):
        """
        Set the number of sequences represented

        Args:
            value (:obj:`int`): The new number of sequences
        """
        pass

    @property
    def offsets(self):
        """
        The offsets associated to each token

        These offsets let you slice the input string, and thus retrieve the original
        part that led to producing the corresponding token.

        Returns:
            A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
        """
        pass

    @offsets.setter
    def offsets(self, value):
        """
        Set the offsets associated to each token

        Args:
            value (A :obj:`List` of :obj:`Tuple[int, int]`): The new list of offsets
        """
        pass

    @property
    def overflowing(self):
        """
        A :obj:`List` of overflowing :class:`~tokenizers.Encoding`

        When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
        the output into as many pieces as required to match the specified maximum length.
        This field lets you retrieve all the subsequent pieces.

        When you use pairs of sequences, the overflowing pieces will contain enough
        variations to cover all the possible combinations, while respecting the provided
        maximum length.
        """
        pass

    @overflowing.setter
    def overflowing(self, value):
        """
        Set the overflowing pieces

        Args:
            value (A :obj:`List` of :class:`~tokenizers.Encoding`): The new overflowing pieces
        """
        pass

    def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
        """
        Pad the :class:`~tokenizers.Encoding` at the given length

        Args:
            length (:obj:`int`):
                The desired length
            direction (:obj:`str`, defaults to :obj:`right`):
                The expected padding direction. Can be either :obj:`right` or :obj:`left`
            pad_id (:obj:`int`, defaults to :obj:`0`):
                The ID corresponding to the padding token
            pad_type_id (:obj:`int`, defaults to :obj:`0`):
                The type ID corresponding to the padding token
            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
                The pad token to use
        """
        pass

    @property
    def sequence_ids(self):
        """
        The generated sequence indices.

        They represent the index of the input sequence associated to each token.
        The sequence id can be None if the token is not related to any input sequence,
        like for example with special tokens.

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
        """
        pass

    @sequence_ids.setter
    def sequence_ids(self, value):
        """
        Set the generated sequence indices.

        Args:
            value (A :obj:`List` of :obj:`Optional[int]`): The new list of sequence indices
        """
        pass

    def set_sequence_id(self, sequence_id):
        """
        Set the given sequence index

        Set the given sequence index for the whole range of tokens contained in this
        :class:`~tokenizers.Encoding`.
        """
        pass

    @property
    def special_tokens_mask(self):
        """
        The special token mask

        This indicates which tokens are special tokens, and which are not.

        Returns:
            :obj:`List[int]`: The special tokens mask
        """
        pass

    @special_tokens_mask.setter
    def special_tokens_mask(self, value):
        """
        Set the special token mask

        Args:
            value (:obj:`List[int]`): The new special tokens mask
        """
        pass

    def token_to_chars(self, token_index):
        """
        Get the offsets of the token at the given index.

        The returned offsets are related to the input sequence that contains the
        token. In order to determine in which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
        """
        pass

    def token_to_sequence(self, token_index):
        """
        Get the index of the sequence represented by the given token.

        In the general use case, this method returns :obj:`0` for a single sequence or
        the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The sequence id of the given token
        """
        pass

    def token_to_word(self, token_index):
        """
        Get the index of the word that contains the token in one of the input sequences.

        The returned word index is related to the input sequence that contains
        the token. In order to determine in which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The index of the word in the relevant input sequence.
        """
        pass

    @property
    def tokens(self):
        """
        The generated tokens

        They are the string representation of the IDs.

        Returns:
            :obj:`List[str]`: The list of tokens
        """
        pass

    @tokens.setter
    def tokens(self, value):
        """
        Set the generated tokens

        Args:
            value (:obj:`List[str]`): The new list of tokens
        """
        pass

    def truncate(self, max_length, stride=0, direction="right"):
        """
        Truncate the :class:`~tokenizers.Encoding` at the given length

        If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
        this information is lost. It will be considered as representing a single sequence.

        Args:
            max_length (:obj:`int`):
                The desired length
            stride (:obj:`int`, defaults to :obj:`0`):
                The length of previous content to be included in each overflowing piece
            direction (:obj:`str`, defaults to :obj:`right`):
                Truncate direction
        """
        pass

    @property
    def type_ids(self):
        """
        The generated type IDs

        Generally used for tasks like sequence classification or question answering,
        these tokens let the LM know which input sequence corresponds to each token.

        Returns:
            :obj:`List[int]`: The list of type ids
        """
        pass

    @type_ids.setter
    def type_ids(self, value):
        """
        Set the generated type IDs

        Args:
            value (:obj:`List[int]`): The new list of type ids
        """
        pass

    @property
    def word_ids(self):
        """
        The generated word indices.

        They represent the index of the word associated to each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
        """
        pass

    @word_ids.setter
    def word_ids(self, value):
        """
        Set the generated word indices.

        Args:
            value (A :obj:`List` of :obj:`Optional[int]`): The new list of word indices
        """
        pass

    def word_to_chars(self, word_index, sequence_index=0):
        """
        Get the offsets of the word at the given index in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
        """
        pass

    def word_to_tokens(self, word_index, sequence_index=0):
        """
        Get the encoded tokens corresponding to the word at the given index
        in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
        """
        pass

    @property
    def words(self):
        """
        The generated word indices.

        .. warning::
            This is deprecated and will be removed in a future version.
            Please use :obj:`~tokenizers.Encoding.word_ids` instead.

        They represent the index of the word associated to each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
        """
        pass

    @words.setter
    def words(self, value):
        """
        Set the generated word indices.

        .. warning::
            This is deprecated and will be removed in a future version.
            Please use :obj:`~tokenizers.Encoding.word_ids` instead.

        Args:
            value (A :obj:`List` of :obj:`Optional[int]`): The new list of word indices
        """
        pass
class NormalizedString:
    """
    NormalizedString

    A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
    While making all the requested modifications, it keeps track of the alignment information
    between the two versions of the string.

    Args:
        sequence: str:
            The string sequence used to initialize this NormalizedString
    """

    def __init__(self, sequence):
        pass

    def __getitem__(self, key):
        """
        Return self[key].
        """
        pass

    def __getstate__(self, /):
        """
        Helper for pickle.
        """
        pass

    def append(self, s):
        """
        Append the given sequence to the string
        """
        pass

    def clear(self):
        """
        Clears the string
        """
        pass

    def filter(self, func):
        """
        Filter each character of the string using the given func
        """
        pass

    def for_each(self, func):
        """
        Calls the given function for each character of the string
        """
        pass

    def lowercase(self):
        """
        Lowercase the string
        """
        pass

    def lstrip(self):
        """
        Strip the left of the string
        """
        pass

    def map(self, func):
        """
        Calls the given function for each character of the string

        Replaces each character of the string using the returned value. Each
        returned value **must** be a str of length 1 (ie a character).
        """
        pass

    def nfc(self):
        """
        Runs the NFC normalization
        """
        pass

    def nfd(self):
        """
        Runs the NFD normalization
        """
        pass

    def nfkc(self):
        """
        Runs the NFKC normalization
        """
        pass

    def nfkd(self):
        """
        Runs the NFKD normalization
        """
        pass

    @property
    def normalized(self):
        """
        The normalized part of the string
        """
        pass

    @normalized.setter
    def normalized(self, value):
        """
        Set the normalized part of the string
        """
        pass

    @property
    def original(self):
        """
        The original (un-normalized) version of the string
        """
        pass

    @original.setter
    def original(self, value):
        """
        Set the original (un-normalized) version of the string
        """
        pass

    def prepend(self, s):
        """
        Prepend the given sequence to the string
        """
        pass

    def replace(self, pattern, content):
        """
        Replace the content of the given pattern with the provided content

        Args:
            pattern: Pattern:
                A pattern used to match the string. Usually a string or a Regex

            content: str:
                The content to be used as replacement
        """
        pass

    def rstrip(self):
        """
        Strip the right of the string
        """
        pass

    def slice(self, range):
        """
        Slice the string using the given range
        """
        pass

    def split(self, pattern, behavior):
        """
        Split the NormalizedString using the given pattern and the specified behavior

        Args:
            pattern: Pattern:
                A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`

            behavior: SplitDelimiterBehavior:
                The behavior to use when splitting.
                Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
                "contiguous"

        Returns:
            A list of NormalizedString, representing each split
        """
        pass

    def strip(self):
        """
        Strip both ends of the string
        """
        pass

    def uppercase(self):
        """
        Uppercase the string
        """
        pass
class PreTokenizedString:
    """
    PreTokenizedString

    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
    underlying string, while keeping track of the alignment information (offsets).

    The PreTokenizedString manages what we call `splits`. Each split represents a substring
    which is a subpart of the original string, with the relevant offsets and tokens.

    When calling one of the methods used to modify the PreTokenizedString (namely one of
    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
    tokens will get modified.

    Args:
        sequence: str:
            The string sequence used to initialize this PreTokenizedString
    """

    def __init__(self, sequence):
        pass

    def __getstate__(self, /):
        """
        Helper for pickle.
        """
        pass

    def get_splits(self, offset_referential="original", offset_type="char"):
        """
        Get the splits currently managed by the PreTokenizedString

        Args:
            offset_referential: :obj:`str`
                Whether the returned splits should have offsets expressed relative
                to the original string, or the normalized one. choices: "original", "normalized".

            offset_type: :obj:`str`
                Whether the returned splits should have offsets expressed in bytes or chars.
                When slicing an str, we usually want to use chars, which is the default value.
                Now in some cases it might be interesting to get these offsets expressed in bytes,
                so it is possible to change this here.
                choices: "char", "bytes"

        Returns:
            A list of splits
        """
        pass

    def normalize(self, func):
        """
        Normalize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[NormalizedString], None]:
                The function used to normalize each underlying split. This function
                does not need to return anything, just calling the methods on the provided
                NormalizedString allows its modification.
        """
        pass

    def split(self, func):
        """
        Split the PreTokenizedString using the given `func`

        Args:
            func: Callable[[index, NormalizedString], List[NormalizedString]]:
                The function used to split each underlying split.
                It is expected to return a list of `NormalizedString`, that represent the new
                splits. If the given `NormalizedString` does not need any splitting, we can
                just return it directly.
                In order for the offsets to be tracked accurately, any returned `NormalizedString`
                should come from calling either `.split` or `.slice` on the received one.
        """
        pass

    def to_encoding(self, type_id=0, word_idx=None):
        """
        Return an Encoding generated from this PreTokenizedString

        Args:
            type_id: int = 0:
                The type_id to be used on the generated Encoding.

            word_idx: Optional[int] = None:
                An optional word index to be used for each token of this Encoding. If provided,
                all the word indices in the generated Encoding will use this value, instead
                of the one automatically tracked during pre-tokenization.

        Returns:
            An Encoding
        """
        pass

    def tokenize(self, func):
        """
        Tokenize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[str], List[Token]]:
                The function used to tokenize each underlying split. This function must return
                a list of Token generated from the input str.
        """
        pass
class Regex:
    """
    Instantiate a new Regex with the given pattern

    Args:
        pattern:
            The pattern to build the regex from (presumably a :obj:`str` — confirm
            against the implementation this stub was generated from)
    """

    def __init__(self, pattern):
        pass

    def __getstate__(self, /):
        """
        Helper for pickle.
        """
        pass
class Token:
    """
    A single token, described by its :obj:`id`, its :obj:`value` and its :obj:`offsets`.

    Args:
        id: The id of the token
        value: The value of the token
        offsets: The offsets of the token
    """

    def __init__(self, id, value, offsets):
        pass

    def __getstate__(self, /):
        """
        Helper for pickle.
        """
        pass

    def as_tuple(self):
        """
        Return this token as a tuple (presumably ``(id, value, offsets)`` — confirm
        against the implementation this stub was generated from)
        """
        pass

    @property
    def id(self):
        """
        Get the :obj:`id` of this token
        """
        pass

    @id.setter
    def id(self, value):
        """
        Set the :obj:`id` of this token
        """
        pass

    @property
    def offsets(self):
        """
        Get the :obj:`offsets` of this token
        """
        pass

    @offsets.setter
    def offsets(self, value):
        """
        Set the :obj:`offsets` of this token
        """
        pass

    @property
    def value(self):
        """
        Get the :obj:`value` of this token
        """
        pass

    @value.setter
    def value(self, value):
        """
        Set the :obj:`value` of this token
        """
        pass
- class Tokenizer:
- """
- A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
- and outputs an :class:`~tokenizers.Encoding`.
- Args:
- model (:class:`~tokenizers.models.Model`):
- The core algorithm that this :obj:`Tokenizer` should be using.
- """
def __init__(self, model):
    # Stub body only; see the class docstring for the meaning of ``model``.
    pass
def __getnewargs__(self):
    """
    Helper for pickle: the arguments passed to ``__new__`` when unpickling.
    """
    pass
def __getstate__(self):
    """
    Helper for pickle.
    """
    pass
def __setstate__(self, state):
    """
    Helper for pickle.
    """
    pass
def add_special_tokens(self, tokens):
    """
    Add the given special tokens to the Tokenizer.

    If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
    them. If they don't exist, the Tokenizer creates them, giving them a new id.

    These special tokens will never be processed by the model (ie won't be split into
    multiple tokens), and they can be removed from the output when decoding.

    Args:
        tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
            The list of special tokens we want to add to the vocabulary. Each token can either
            be a string or an instance of :class:`~tokenizers.AddedToken` for more
            customization.

    Returns:
        :obj:`int`: The number of tokens that were created in the vocabulary
    """
    pass
def add_tokens(self, tokens):
    """
    Add the given tokens to the vocabulary

    The given tokens are added only if they don't already exist in the vocabulary.
    Each token then gets a new attributed id.

    Args:
        tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
            The list of tokens we want to add to the vocabulary. Each token can be either a
            string or an instance of :class:`~tokenizers.AddedToken` for more customization.

    Returns:
        :obj:`int`: The number of tokens that were created in the vocabulary
    """
    pass
def async_decode_batch(self, sequences, skip_special_tokens=True):
    """
    Decode a batch of ids back to their corresponding string

    Args:
        sequences (:obj:`List` of :obj:`List[int]`):
            The batch of sequences we want to decode

        skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be removed from the decoded strings

    Returns:
        :obj:`List[str]`: A list of decoded strings
    """
    pass
- def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
- """
- Asynchronously encode the given input with character offsets.
- This is an async version of encode that can be awaited in async Python code.
- Example:
- Here are some examples of the inputs that are accepted::
- await async_encode("A single sequence")
- Args:
- sequence (:obj:`~tokenizers.InputSequence`):
- The main input sequence we want to encode. This sequence can be either raw
- text or pre-tokenized, according to the ``is_pretokenized`` argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
- pair (:obj:`~tokenizers.InputSequence`, `optional`):
- An optional input sequence. The expected format is the same as for ``sequence``.
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- :class:`~tokenizers.Encoding`: The encoded result
- """
- pass
- def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Asynchronously encode the given batch of inputs with character offsets.
- This is an async version of encode_batch that can be awaited in async Python code.
- Example:
- Here are some examples of the inputs that are accepted::
- await async_encode_batch([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Asynchronously encode the given batch of inputs without tracking character offsets.
- This is an async version of encode_batch_fast that can be awaited in async Python code.
- Example:
- Here are some examples of the inputs that are accepted::
- await async_encode_batch_fast([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- def decode(self, ids, skip_special_tokens=True):
- """
- Decode the given list of ids back to a string
- This is used to decode anything coming back from a Language Model
- Args:
- ids (A :obj:`List/Tuple` of :obj:`int`):
- The list of ids that we want to decode
- skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether the special tokens should be removed from the decoded string
- Returns:
- :obj:`str`: The decoded string
- """
- pass
- def decode_batch(self, sequences, skip_special_tokens=True):
- """
- Decode a batch of ids back to their corresponding strings
- Args:
- sequences (:obj:`List` of :obj:`List[int]`):
- The batch of sequences we want to decode
- skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether the special tokens should be removed from the decoded strings
- Returns:
- :obj:`List[str]`: A list of decoded strings
- """
- pass
- @property
- def decoder(self):
- """
- The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
- """
- pass
- @decoder.setter
- def decoder(self, value):
- """
- The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
- """
- pass
- def enable_padding(
- self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
- ):
- """
- Enable the padding
- Args:
- direction (:obj:`str`, `optional`, defaults to :obj:`right`):
- The direction in which to pad. Can be either ``right`` or ``left``
- pad_to_multiple_of (:obj:`int`, `optional`):
- If specified, the padding length should always snap to the next multiple of the
- given value. For example if we were going to pad with a length of 250 but
- ``pad_to_multiple_of=8`` then we will pad to 256.
- pad_id (:obj:`int`, defaults to 0):
- The id to be used when padding
- pad_type_id (:obj:`int`, defaults to 0):
- The type id to be used when padding
- pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
- The pad token to be used when padding
- length (:obj:`int`, `optional`):
- If specified, the length at which to pad. If not specified we pad using the size of
- the longest sequence in a batch.
- """
- pass
- def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
- """
- Enable truncation
- Args:
- max_length (:obj:`int`):
- The max length at which to truncate
- stride (:obj:`int`, `optional`):
- The length of the previous first sequence to be included in the overflowing
- sequence
- strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
- The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
- ``only_second``.
- direction (:obj:`str`, defaults to :obj:`right`):
- Truncate direction
- """
- pass
- def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given sequence and pair. This method can process raw text sequences
- as well as already pre-tokenized sequences.
- Example:
- Here are some examples of the inputs that are accepted::
- encode("A single sequence")
- encode("A sequence", "And its pair")
- encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
- encode(
- [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
- is_pretokenized=True
- )
- Args:
- sequence (:obj:`~tokenizers.InputSequence`):
- The main input sequence we want to encode. This sequence can be either raw
- text or pre-tokenized, according to the ``is_pretokenized`` argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
- pair (:obj:`~tokenizers.InputSequence`, `optional`):
- An optional input sequence. The expected format is the same as for ``sequence``.
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- :class:`~tokenizers.Encoding`: The encoded result
- """
- pass
- def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given batch of inputs. This method accepts both raw text sequences
- as well as already pre-tokenized sequences. The reason we use `PySequence` is
- because it allows type checking with zero-cost (according to PyO3) as we don't
- have to convert to check.
- Example:
- Here are some examples of the inputs that are accepted::
- encode_batch([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
- """
- Encode the given batch of inputs. This method is faster than `encode_batch`
- because it doesn't keep track of offsets; they will all be zeros.
- Example:
- Here are some examples of the inputs that are accepted::
- encode_batch_fast([
- "A single sequence",
- ("A tuple with a sequence", "And its pair"),
- [ "A", "pre", "tokenized", "sequence" ],
- ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
- ])
- Args:
- input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
- A list of single sequences or pair sequences to encode. Each sequence
- can be either raw text or pre-tokenized, according to the ``is_pretokenized``
- argument:
- - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Whether the input is already pre-tokenized
- add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to add the special tokens
- Returns:
- A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
- """
- pass
- @property
- def encode_special_tokens(self):
- """
- Modifies the tokenizer in order to use or not the special tokens
- during encoding.
- Args:
- value (:obj:`bool`):
- Whether to use the special tokens or not
- """
- pass
- @encode_special_tokens.setter
- def encode_special_tokens(self, value):
- """
- Modifies the tokenizer in order to use or not the special tokens
- during encoding.
- Args:
- value (:obj:`bool`):
- Whether to use the special tokens or not
- """
- pass
- @staticmethod
- def from_buffer(buffer):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
- Args:
- buffer (:obj:`bytes`):
- A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_file(path):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
- Args:
- path (:obj:`str`):
- A path to a local JSON file representing a previously serialized
- :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_pretrained(identifier, revision="main", token=None):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
- Hugging Face Hub.
- Args:
- identifier (:obj:`str`):
- The identifier of a Model on the Hugging Face Hub, that contains
- a tokenizer.json file
- revision (:obj:`str`, defaults to `main`):
- A branch or commit id
- token (:obj:`str`, `optional`, defaults to `None`):
- An optional auth token used to access private repositories on the
- Hugging Face Hub
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- @staticmethod
- def from_str(json):
- """
- Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
- Args:
- json (:obj:`str`):
- A valid JSON string representing a previously serialized
- :class:`~tokenizers.Tokenizer`
- Returns:
- :class:`~tokenizers.Tokenizer`: The new tokenizer
- """
- pass
- def get_added_tokens_decoder(self):
- """
- Get the added tokens of the Tokenizer, keyed by their id
- Returns:
- :obj:`Dict[int, AddedToken]`: A mapping from each id to its added token
- """
- pass
- def get_vocab(self, with_added_tokens=True):
- """
- Get the underlying vocabulary
- Args:
- with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to include the added tokens
- Returns:
- :obj:`Dict[str, int]`: The vocabulary
- """
- pass
- def get_vocab_size(self, with_added_tokens=True):
- """
- Get the size of the underlying vocabulary
- Args:
- with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
- Whether to include the added tokens
- Returns:
- :obj:`int`: The size of the vocabulary
- """
- pass
- def id_to_token(self, id):
- """
- Convert the given id to its corresponding token if it exists
- Args:
- id (:obj:`int`):
- The id to convert
- Returns:
- :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
- """
- pass
- @property
- def model(self):
- """
- The :class:`~tokenizers.models.Model` in use by the Tokenizer
- """
- pass
- @model.setter
- def model(self, value):
- """
- The :class:`~tokenizers.models.Model` in use by the Tokenizer
- """
- pass
- def no_padding(self):
- """
- Disable padding
- """
- pass
- def no_truncation(self):
- """
- Disable truncation
- """
- pass
- @property
- def normalizer(self):
- """
- The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
- """
- pass
- @normalizer.setter
- def normalizer(self, value):
- """
- The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
- """
- pass
- def num_special_tokens_to_add(self, is_pair):
- """
- Return the number of special tokens that would be added for single/pair sentences.
- Args:
- is_pair (:obj:`bool`):
- Whether the input would be a single sentence or a pair
- Returns:
- :obj:`int`: The number of special tokens that would be added
- """
- pass
- @property
- def padding(self):
- """
- Get the current padding parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current padding parameters if padding is enabled
- """
- pass
- @padding.setter
- def padding(self, value):
- """
- Get the current padding parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current padding parameters if padding is enabled
- """
- pass
- def post_process(self, encoding, pair=None, add_special_tokens=True):
- """
- Apply all the post-processing steps to the given encodings.
- The various steps are:
- 1. Truncate according to the set truncation params (provided with
- :meth:`~tokenizers.Tokenizer.enable_truncation`)
- 2. Apply the :class:`~tokenizers.processors.PostProcessor`
- 3. Pad according to the set padding params (provided with
- :meth:`~tokenizers.Tokenizer.enable_padding`)
- Args:
- encoding (:class:`~tokenizers.Encoding`):
- The :class:`~tokenizers.Encoding` corresponding to the main sequence.
- pair (:class:`~tokenizers.Encoding`, `optional`):
- An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
- add_special_tokens (:obj:`bool`):
- Whether to add the special tokens
- Returns:
- :class:`~tokenizers.Encoding`: The final post-processed encoding
- """
- pass
- @property
- def post_processor(self):
- """
- The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
- """
- pass
- @post_processor.setter
- def post_processor(self, value):
- """
- The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
- """
- pass
- @property
- def pre_tokenizer(self):
- """
- The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
- """
- pass
- @pre_tokenizer.setter
- def pre_tokenizer(self, value):
- """
- The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
- """
- pass
- def save(self, path, pretty=True):
- """
- Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
- Args:
- path (:obj:`str`):
- A path to a file in which to save the serialized tokenizer.
- pretty (:obj:`bool`, defaults to :obj:`True`):
- Whether the JSON file should be pretty formatted.
- """
- pass
- def to_str(self, pretty=False):
- """
- Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
- Args:
- pretty (:obj:`bool`, defaults to :obj:`False`):
- Whether the JSON string should be pretty formatted.
- Returns:
- :obj:`str`: A string representing the serialized Tokenizer
- """
- pass
- def token_to_id(self, token):
- """
- Convert the given token to its corresponding id if it exists
- Args:
- token (:obj:`str`):
- The token to convert
- Returns:
- :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
- """
- pass
- def train(self, files, trainer=None):
- """
- Train the Tokenizer using the given files.
- Reads the files line by line, while keeping all the whitespace, even new lines.
- If you want to train from data stored in memory, you can check
- :meth:`~tokenizers.Tokenizer.train_from_iterator`
- Args:
- files (:obj:`List[str]`):
- A list of paths to the files that we should use for training
- trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
- An optional trainer that should be used to train our Model
- """
- pass
- def train_from_iterator(self, iterator, trainer=None, length=None):
- """
- Train the Tokenizer using the provided iterator.
- You can provide anything that is a Python Iterator
- * A list of sequences :obj:`List[str]`
- * A generator that yields :obj:`str` or :obj:`List[str]`
- * A Numpy array of strings
- * ...
- Args:
- iterator (:obj:`Iterator`):
- Any iterator over strings or list of strings
- trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
- An optional trainer that should be used to train our Model
- length (:obj:`int`, `optional`):
- The total number of sequences in the iterator. This is used to
- provide meaningful progress tracking
- """
- pass
- @property
- def truncation(self):
- """
- Get the currently set truncation parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current truncation parameters if truncation is enabled
- """
- pass
- @truncation.setter
- def truncation(self, value):
- """
- Get the currently set truncation parameters
- `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
- Returns:
- (:obj:`dict`, `optional`):
- A dict with the current truncation parameters if truncation is enabled
- """
- pass
- from enum import Enum
- from typing import List, Tuple, Union, Any
- Offsets = Tuple[int, int]
- TextInputSequence = str
- PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
- TextEncodeInput = Union[
- TextInputSequence,
- Tuple[TextInputSequence, TextInputSequence],
- List[TextInputSequence],
- ]
- PreTokenizedEncodeInput = Union[
- PreTokenizedInputSequence,
- Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
- List[PreTokenizedInputSequence],
- ]
- InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
- EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
- class OffsetReferential(Enum):
- ORIGINAL = "original"
- NORMALIZED = "normalized"
- class OffsetType(Enum):
- BYTE = "byte"
- CHAR = "char"
- class SplitDelimiterBehavior(Enum):
- REMOVED = "removed"
- ISOLATED = "isolated"
- MERGED_WITH_PREVIOUS = "merged_with_previous"
- MERGED_WITH_NEXT = "merged_with_next"
- CONTIGUOUS = "contiguous"
- from .implementations import (
- BertWordPieceTokenizer,
- ByteLevelBPETokenizer,
- CharBPETokenizer,
- SentencePieceBPETokenizer,
- SentencePieceUnigramTokenizer,
- )
- def __getattr__(name: str) -> Any: ...
- BertWordPieceTokenizer: Any
- ByteLevelBPETokenizer: Any
- CharBPETokenizer: Any
- SentencePieceBPETokenizer: Any
- SentencePieceUnigramTokenizer: Any
|