__init__.pyi 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800
  1. # Generated content DO NOT EDIT
  2. class AddedToken:
  3. """
  4. Represents a token that can be be added to a :class:`~tokenizers.Tokenizer`.
  5. It can have special options that defines the way it should behave.
  6. Args:
  7. content (:obj:`str`): The content of the token
  8. single_word (:obj:`bool`, defaults to :obj:`False`):
  9. Defines whether this token should only match single words. If :obj:`True`, this
  10. token will never match inside of a word. For example the token ``ing`` would match
  11. on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
  12. The notion of "`inside of a word`" is defined by the word boundaries pattern in
  13. regular expressions (ie. the token should start and end with word boundaries).
  14. lstrip (:obj:`bool`, defaults to :obj:`False`):
  15. Defines whether this token should strip all potential whitespaces on its left side.
  16. If :obj:`True`, this token will greedily match any whitespace on its left. For
  17. example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
  18. ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
  19. rstrip (:obj:`bool`, defaults to :obj:`False`):
  20. Defines whether this token should strip all potential whitespaces on its right
  21. side. If :obj:`True`, this token will greedily match any whitespace on its right.
  22. It works just like :obj:`lstrip` but on the right.
  23. normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
  24. Defines whether this token should match against the normalized version of the input
  25. text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
  26. lowercasing the text, the token could be extract from the input ``"I saw a lion
  27. Yesterday"``.
  28. special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
  29. Defines whether this token should be skipped when decoding.
  30. """
  31. def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
  32. pass
  33. def __getstate__(self):
  34. """ """
  35. pass
  36. def __setstate__(self, state):
  37. """ """
  38. pass
  39. @property
  40. def content(self):
  41. """
  42. Get the content of this :obj:`AddedToken`
  43. """
  44. pass
  45. @content.setter
  46. def content(self, value):
  47. """
  48. Get the content of this :obj:`AddedToken`
  49. """
  50. pass
  51. @property
  52. def lstrip(self):
  53. """
  54. Get the value of the :obj:`lstrip` option
  55. """
  56. pass
  57. @lstrip.setter
  58. def lstrip(self, value):
  59. """
  60. Get the value of the :obj:`lstrip` option
  61. """
  62. pass
  63. @property
  64. def normalized(self):
  65. """
  66. Get the value of the :obj:`normalized` option
  67. """
  68. pass
  69. @normalized.setter
  70. def normalized(self, value):
  71. """
  72. Get the value of the :obj:`normalized` option
  73. """
  74. pass
  75. @property
  76. def rstrip(self):
  77. """
  78. Get the value of the :obj:`rstrip` option
  79. """
  80. pass
  81. @rstrip.setter
  82. def rstrip(self, value):
  83. """
  84. Get the value of the :obj:`rstrip` option
  85. """
  86. pass
  87. @property
  88. def single_word(self):
  89. """
  90. Get the value of the :obj:`single_word` option
  91. """
  92. pass
  93. @single_word.setter
  94. def single_word(self, value):
  95. """
  96. Get the value of the :obj:`single_word` option
  97. """
  98. pass
  99. @property
  100. def special(self):
  101. """
  102. Get the value of the :obj:`special` option
  103. """
  104. pass
  105. @special.setter
  106. def special(self, value):
  107. """
  108. Get the value of the :obj:`special` option
  109. """
  110. pass
  111. class Encoding:
  112. """
  113. The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
  114. """
  115. def __init__(self):
  116. pass
  117. def __getstate__(self):
  118. """ """
  119. pass
  120. def __setstate__(self, state):
  121. """ """
  122. pass
  123. @property
  124. def attention_mask(self):
  125. """
  126. The attention mask
  127. This indicates to the LM which tokens should be attended to, and which should not.
  128. This is especially important when batching sequences, where we need to applying
  129. padding.
  130. Returns:
  131. :obj:`List[int]`: The attention mask
  132. """
  133. pass
  134. @attention_mask.setter
  135. def attention_mask(self, value):
  136. """
  137. The attention mask
  138. This indicates to the LM which tokens should be attended to, and which should not.
  139. This is especially important when batching sequences, where we need to applying
  140. padding.
  141. Returns:
  142. :obj:`List[int]`: The attention mask
  143. """
  144. pass
  145. def char_to_token(self, char_pos, sequence_index=0):
  146. """
  147. Get the token that contains the char at the given position in the input sequence.
  148. Args:
  149. char_pos (:obj:`int`):
  150. The position of a char in the input string
  151. sequence_index (:obj:`int`, defaults to :obj:`0`):
  152. The index of the sequence that contains the target char
  153. Returns:
  154. :obj:`int`: The index of the token that contains this char in the encoded sequence
  155. """
  156. pass
  157. def char_to_word(self, char_pos, sequence_index=0):
  158. """
  159. Get the word that contains the char at the given position in the input sequence.
  160. Args:
  161. char_pos (:obj:`int`):
  162. The position of a char in the input string
  163. sequence_index (:obj:`int`, defaults to :obj:`0`):
  164. The index of the sequence that contains the target char
  165. Returns:
  166. :obj:`int`: The index of the word that contains this char in the input sequence
  167. """
  168. pass
  169. @property
  170. def ids(self):
  171. """
  172. The generated IDs
  173. The IDs are the main input to a Language Model. They are the token indices,
  174. the numerical representations that a LM understands.
  175. Returns:
  176. :obj:`List[int]`: The list of IDs
  177. """
  178. pass
  179. @ids.setter
  180. def ids(self, value):
  181. """
  182. The generated IDs
  183. The IDs are the main input to a Language Model. They are the token indices,
  184. the numerical representations that a LM understands.
  185. Returns:
  186. :obj:`List[int]`: The list of IDs
  187. """
  188. pass
  189. @staticmethod
  190. def merge(encodings, growing_offsets=True):
  191. """
  192. Merge the list of encodings into one final :class:`~tokenizers.Encoding`
  193. Args:
  194. encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
  195. The list of encodings that should be merged in one
  196. growing_offsets (:obj:`bool`, defaults to :obj:`True`):
  197. Whether the offsets should accumulate while merging
  198. Returns:
  199. :class:`~tokenizers.Encoding`: The resulting Encoding
  200. """
  201. pass
  202. @property
  203. def n_sequences(self):
  204. """
  205. The number of sequences represented
  206. Returns:
  207. :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
  208. """
  209. pass
  210. @n_sequences.setter
  211. def n_sequences(self, value):
  212. """
  213. The number of sequences represented
  214. Returns:
  215. :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
  216. """
  217. pass
  218. @property
  219. def offsets(self):
  220. """
  221. The offsets associated to each token
  222. These offsets let's you slice the input string, and thus retrieve the original
  223. part that led to producing the corresponding token.
  224. Returns:
  225. A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
  226. """
  227. pass
  228. @offsets.setter
  229. def offsets(self, value):
  230. """
  231. The offsets associated to each token
  232. These offsets let's you slice the input string, and thus retrieve the original
  233. part that led to producing the corresponding token.
  234. Returns:
  235. A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
  236. """
  237. pass
  238. @property
  239. def overflowing(self):
  240. """
  241. A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
  242. When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
  243. the output into as many pieces as required to match the specified maximum length.
  244. This field lets you retrieve all the subsequent pieces.
  245. When you use pairs of sequences, the overflowing pieces will contain enough
  246. variations to cover all the possible combinations, while respecting the provided
  247. maximum length.
  248. """
  249. pass
  250. @overflowing.setter
  251. def overflowing(self, value):
  252. """
  253. A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
  254. When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
  255. the output into as many pieces as required to match the specified maximum length.
  256. This field lets you retrieve all the subsequent pieces.
  257. When you use pairs of sequences, the overflowing pieces will contain enough
  258. variations to cover all the possible combinations, while respecting the provided
  259. maximum length.
  260. """
  261. pass
  262. def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
  263. """
  264. Pad the :class:`~tokenizers.Encoding` at the given length
  265. Args:
  266. length (:obj:`int`):
  267. The desired length
  268. direction: (:obj:`str`, defaults to :obj:`right`):
  269. The expected padding direction. Can be either :obj:`right` or :obj:`left`
  270. pad_id (:obj:`int`, defaults to :obj:`0`):
  271. The ID corresponding to the padding token
  272. pad_type_id (:obj:`int`, defaults to :obj:`0`):
  273. The type ID corresponding to the padding token
  274. pad_token (:obj:`str`, defaults to `[PAD]`):
  275. The pad token to use
  276. """
  277. pass
  278. @property
  279. def sequence_ids(self):
  280. """
  281. The generated sequence indices.
  282. They represent the index of the input sequence associated to each token.
  283. The sequence id can be None if the token is not related to any input sequence,
  284. like for example with special tokens.
  285. Returns:
  286. A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
  287. """
  288. pass
  289. @sequence_ids.setter
  290. def sequence_ids(self, value):
  291. """
  292. The generated sequence indices.
  293. They represent the index of the input sequence associated to each token.
  294. The sequence id can be None if the token is not related to any input sequence,
  295. like for example with special tokens.
  296. Returns:
  297. A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
  298. """
  299. pass
  300. def set_sequence_id(self, sequence_id):
  301. """
  302. Set the given sequence index
  303. Set the given sequence index for the whole range of tokens contained in this
  304. :class:`~tokenizers.Encoding`.
  305. """
  306. pass
  307. @property
  308. def special_tokens_mask(self):
  309. """
  310. The special token mask
  311. This indicates which tokens are special tokens, and which are not.
  312. Returns:
  313. :obj:`List[int]`: The special tokens mask
  314. """
  315. pass
  316. @special_tokens_mask.setter
  317. def special_tokens_mask(self, value):
  318. """
  319. The special token mask
  320. This indicates which tokens are special tokens, and which are not.
  321. Returns:
  322. :obj:`List[int]`: The special tokens mask
  323. """
  324. pass
  325. def token_to_chars(self, token_index):
  326. """
  327. Get the offsets of the token at the given index.
  328. The returned offsets are related to the input sequence that contains the
  329. token. In order to determine in which input sequence it belongs, you
  330. must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
  331. Args:
  332. token_index (:obj:`int`):
  333. The index of a token in the encoded sequence.
  334. Returns:
  335. :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
  336. """
  337. pass
  338. def token_to_sequence(self, token_index):
  339. """
  340. Get the index of the sequence represented by the given token.
  341. In the general use case, this method returns :obj:`0` for a single sequence or
  342. the first sequence of a pair, and :obj:`1` for the second sequence of a pair
  343. Args:
  344. token_index (:obj:`int`):
  345. The index of a token in the encoded sequence.
  346. Returns:
  347. :obj:`int`: The sequence id of the given token
  348. """
  349. pass
  350. def token_to_word(self, token_index):
  351. """
  352. Get the index of the word that contains the token in one of the input sequences.
  353. The returned word index is related to the input sequence that contains
  354. the token. In order to determine in which input sequence it belongs, you
  355. must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
  356. Args:
  357. token_index (:obj:`int`):
  358. The index of a token in the encoded sequence.
  359. Returns:
  360. :obj:`int`: The index of the word in the relevant input sequence.
  361. """
  362. pass
  363. @property
  364. def tokens(self):
  365. """
  366. The generated tokens
  367. They are the string representation of the IDs.
  368. Returns:
  369. :obj:`List[str]`: The list of tokens
  370. """
  371. pass
  372. @tokens.setter
  373. def tokens(self, value):
  374. """
  375. The generated tokens
  376. They are the string representation of the IDs.
  377. Returns:
  378. :obj:`List[str]`: The list of tokens
  379. """
  380. pass
  381. def truncate(self, max_length, stride=0, direction="right"):
  382. """
  383. Truncate the :class:`~tokenizers.Encoding` at the given length
  384. If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
  385. this information is lost. It will be considered as representing a single sequence.
  386. Args:
  387. max_length (:obj:`int`):
  388. The desired length
  389. stride (:obj:`int`, defaults to :obj:`0`):
  390. The length of previous content to be included in each overflowing piece
  391. direction (:obj:`str`, defaults to :obj:`right`):
  392. Truncate direction
  393. """
  394. pass
  395. @property
  396. def type_ids(self):
  397. """
  398. The generated type IDs
  399. Generally used for tasks like sequence classification or question answering,
  400. these tokens let the LM know which input sequence corresponds to each tokens.
  401. Returns:
  402. :obj:`List[int]`: The list of type ids
  403. """
  404. pass
  405. @type_ids.setter
  406. def type_ids(self, value):
  407. """
  408. The generated type IDs
  409. Generally used for tasks like sequence classification or question answering,
  410. these tokens let the LM know which input sequence corresponds to each tokens.
  411. Returns:
  412. :obj:`List[int]`: The list of type ids
  413. """
  414. pass
  415. @property
  416. def word_ids(self):
  417. """
  418. The generated word indices.
  419. They represent the index of the word associated to each token.
  420. When the input is pre-tokenized, they correspond to the ID of the given input label,
  421. otherwise they correspond to the words indices as defined by the
  422. :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
  423. For special tokens and such (any token that was generated from something that was
  424. not part of the input), the output is :obj:`None`
  425. Returns:
  426. A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
  427. """
  428. pass
  429. @word_ids.setter
  430. def word_ids(self, value):
  431. """
  432. The generated word indices.
  433. They represent the index of the word associated to each token.
  434. When the input is pre-tokenized, they correspond to the ID of the given input label,
  435. otherwise they correspond to the words indices as defined by the
  436. :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
  437. For special tokens and such (any token that was generated from something that was
  438. not part of the input), the output is :obj:`None`
  439. Returns:
  440. A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
  441. """
  442. pass
  443. def word_to_chars(self, word_index, sequence_index=0):
  444. """
  445. Get the offsets of the word at the given index in one of the input sequences.
  446. Args:
  447. word_index (:obj:`int`):
  448. The index of a word in one of the input sequences.
  449. sequence_index (:obj:`int`, defaults to :obj:`0`):
  450. The index of the sequence that contains the target word
  451. Returns:
  452. :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
  453. """
  454. pass
  455. def word_to_tokens(self, word_index, sequence_index=0):
  456. """
  457. Get the encoded tokens corresponding to the word at the given index
  458. in one of the input sequences.
  459. Args:
  460. word_index (:obj:`int`):
  461. The index of a word in one of the input sequences.
  462. sequence_index (:obj:`int`, defaults to :obj:`0`):
  463. The index of the sequence that contains the target word
  464. Returns:
  465. :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
  466. """
  467. pass
  468. @property
  469. def words(self):
  470. """
  471. The generated word indices.
  472. .. warning::
  473. This is deprecated and will be removed in a future version.
  474. Please use :obj:`~tokenizers.Encoding.word_ids` instead.
  475. They represent the index of the word associated to each token.
  476. When the input is pre-tokenized, they correspond to the ID of the given input label,
  477. otherwise they correspond to the words indices as defined by the
  478. :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
  479. For special tokens and such (any token that was generated from something that was
  480. not part of the input), the output is :obj:`None`
  481. Returns:
  482. A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
  483. """
  484. pass
  485. @words.setter
  486. def words(self, value):
  487. """
  488. The generated word indices.
  489. .. warning::
  490. This is deprecated and will be removed in a future version.
  491. Please use :obj:`~tokenizers.Encoding.word_ids` instead.
  492. They represent the index of the word associated to each token.
  493. When the input is pre-tokenized, they correspond to the ID of the given input label,
  494. otherwise they correspond to the words indices as defined by the
  495. :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
  496. For special tokens and such (any token that was generated from something that was
  497. not part of the input), the output is :obj:`None`
  498. Returns:
  499. A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
  500. """
  501. pass
  502. class NormalizedString:
  503. """
  504. NormalizedString
  505. A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
  506. While making all the requested modifications, it keeps track of the alignment information
  507. between the two versions of the string.
  508. Args:
  509. sequence: str:
  510. The string sequence used to initialize this NormalizedString
  511. """
  512. def __init__(self, sequence):
  513. pass
  514. def __getitem__(self, key):
  515. """
  516. Return self[key].
  517. """
  518. pass
  519. def __getstate__(self, /):
  520. """
  521. Helper for pickle.
  522. """
  523. pass
  524. def append(self, s):
  525. """
  526. Append the given sequence to the string
  527. """
  528. pass
  529. def clear(self):
  530. """
  531. Clears the string
  532. """
  533. pass
  534. def filter(self, func):
  535. """
  536. Filter each character of the string using the given func
  537. """
  538. pass
  539. def for_each(self, func):
  540. """
  541. Calls the given function for each character of the string
  542. """
  543. pass
  544. def lowercase(self):
  545. """
  546. Lowercase the string
  547. """
  548. pass
  549. def lstrip(self):
  550. """
  551. Strip the left of the string
  552. """
  553. pass
  554. def map(self, func):
  555. """
  556. Calls the given function for each character of the string
  557. Replaces each character of the string using the returned value. Each
  558. returned value **must** be a str of length 1 (ie a character).
  559. """
  560. pass
  561. def nfc(self):
  562. """
  563. Runs the NFC normalization
  564. """
  565. pass
  566. def nfd(self):
  567. """
  568. Runs the NFD normalization
  569. """
  570. pass
  571. def nfkc(self):
  572. """
  573. Runs the NFKC normalization
  574. """
  575. pass
  576. def nfkd(self):
  577. """
  578. Runs the NFKD normalization
  579. """
  580. pass
  581. @property
  582. def normalized(self):
  583. """
  584. The normalized part of the string
  585. """
  586. pass
  587. @normalized.setter
  588. def normalized(self, value):
  589. """
  590. The normalized part of the string
  591. """
  592. pass
  593. @property
  594. def original(self):
  595. """ """
  596. pass
  597. @original.setter
  598. def original(self, value):
  599. """ """
  600. pass
  601. def prepend(self, s):
  602. """
  603. Prepend the given sequence to the string
  604. """
  605. pass
  606. def replace(self, pattern, content):
  607. """
  608. Replace the content of the given pattern with the provided content
  609. Args:
  610. pattern: Pattern:
  611. A pattern used to match the string. Usually a string or a Regex
  612. content: str:
  613. The content to be used as replacement
  614. """
  615. pass
  616. def rstrip(self):
  617. """
  618. Strip the right of the string
  619. """
  620. pass
  621. def slice(self, range):
  622. """
  623. Slice the string using the given range
  624. """
  625. pass
  626. def split(self, pattern, behavior):
  627. """
  628. Split the NormalizedString using the given pattern and the specified behavior
  629. Args:
  630. pattern: Pattern:
  631. A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
  632. behavior: SplitDelimiterBehavior:
  633. The behavior to use when splitting.
  634. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
  635. "contiguous"
  636. Returns:
  637. A list of NormalizedString, representing each split
  638. """
  639. pass
  640. def strip(self):
  641. """
  642. Strip both ends of the string
  643. """
  644. pass
  645. def uppercase(self):
  646. """
  647. Uppercase the string
  648. """
  649. pass
  650. class PreTokenizedString:
  651. """
  652. PreTokenizedString
  653. Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
  654. underlying string, while keeping track of the alignment information (offsets).
  655. The PreTokenizedString manages what we call `splits`. Each split represents a substring
  656. which is a subpart of the original string, with the relevant offsets and tokens.
  657. When calling one of the methods used to modify the PreTokenizedString (namely one of
  658. `split`, `normalize` or `tokenize), only the `splits` that don't have any associated
  659. tokens will get modified.
  660. Args:
  661. sequence: str:
  662. The string sequence used to initialize this PreTokenizedString
  663. """
  664. def __init__(self, sequence):
  665. pass
  666. def __getstate__(self, /):
  667. """
  668. Helper for pickle.
  669. """
  670. pass
  671. def get_splits(self, offset_referential="original", offset_type="char"):
  672. """
  673. Get the splits currently managed by the PreTokenizedString
  674. Args:
  675. offset_referential: :obj:`str`
  676. Whether the returned splits should have offsets expressed relative
  677. to the original string, or the normalized one. choices: "original", "normalized".
  678. offset_type: :obj:`str`
  679. Whether the returned splits should have offsets expressed in bytes or chars.
  680. When slicing an str, we usually want to use chars, which is the default value.
  681. Now in some cases it might be interesting to get these offsets expressed in bytes,
  682. so it is possible to change this here.
  683. choices: "char", "bytes"
  684. Returns
  685. A list of splits
  686. """
  687. pass
  688. def normalize(self, func):
  689. """
  690. Normalize each split of the `PreTokenizedString` using the given `func`
  691. Args:
  692. func: Callable[[NormalizedString], None]:
  693. The function used to normalize each underlying split. This function
  694. does not need to return anything, just calling the methods on the provided
  695. NormalizedString allow its modification.
  696. """
  697. pass
  698. def split(self, func):
  699. """
  700. Split the PreTokenizedString using the given `func`
  701. Args:
  702. func: Callable[[index, NormalizedString], List[NormalizedString]]:
  703. The function used to split each underlying split.
  704. It is expected to return a list of `NormalizedString`, that represent the new
  705. splits. If the given `NormalizedString` does not need any splitting, we can
  706. just return it directly.
  707. In order for the offsets to be tracked accurately, any returned `NormalizedString`
  708. should come from calling either `.split` or `.slice` on the received one.
  709. """
  710. pass
  711. def to_encoding(self, type_id=0, word_idx=None):
  712. """
  713. Return an Encoding generated from this PreTokenizedString
  714. Args:
  715. type_id: int = 0:
  716. The type_id to be used on the generated Encoding.
  717. word_idx: Optional[int] = None:
  718. An optional word index to be used for each token of this Encoding. If provided,
  719. all the word indices in the generated Encoding will use this value, instead
  720. of the one automatically tracked during pre-tokenization.
  721. Returns:
  722. An Encoding
  723. """
  724. pass
  725. def tokenize(self, func):
  726. """
  727. Tokenize each split of the `PreTokenizedString` using the given `func`
  728. Args:
  729. func: Callable[[str], List[Token]]:
  730. The function used to tokenize each underlying split. This function must return
  731. a list of Token generated from the input str.
  732. """
  733. pass
  734. class Regex:
  735. """
  736. Instantiate a new Regex with the given pattern
  737. """
  738. def __init__(self, pattern):
  739. pass
  740. def __getstate__(self, /):
  741. """
  742. Helper for pickle.
  743. """
  744. pass
  745. class Token:
  746. def __init__(self, id, value, offsets):
  747. pass
  748. def __getstate__(self, /):
  749. """
  750. Helper for pickle.
  751. """
  752. pass
  753. def as_tuple(self):
  754. """ """
  755. pass
  756. @property
  757. def id(self):
  758. """ """
  759. pass
  760. @id.setter
  761. def id(self, value):
  762. """ """
  763. pass
  764. @property
  765. def offsets(self):
  766. """ """
  767. pass
  768. @offsets.setter
  769. def offsets(self, value):
  770. """ """
  771. pass
  772. @property
  773. def value(self):
  774. """ """
  775. pass
  776. @value.setter
  777. def value(self, value):
  778. """ """
  779. pass
  780. class Tokenizer:
  781. """
  782. A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
  783. and outputs an :class:`~tokenizers.Encoding`.
  784. Args:
  785. model (:class:`~tokenizers.models.Model`):
  786. The core algorithm that this :obj:`Tokenizer` should be using.
  787. """
  788. def __init__(self, model):
  789. pass
  790. def __getnewargs__(self):
  791. """ """
  792. pass
  793. def __getstate__(self):
  794. """ """
  795. pass
  796. def __setstate__(self, state):
  797. """ """
  798. pass
  799. def add_special_tokens(self, tokens):
  800. """
  801. Add the given special tokens to the Tokenizer.
  802. If these tokens are already part of the vocabulary, it just let the Tokenizer know about
  803. them. If they don't exist, the Tokenizer creates them, giving them a new id.
  804. These special tokens will never be processed by the model (ie won't be split into
  805. multiple tokens), and they can be removed from the output when decoding.
  806. Args:
  807. tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
  808. The list of special tokens we want to add to the vocabulary. Each token can either
  809. be a string or an instance of :class:`~tokenizers.AddedToken` for more
  810. customization.
  811. Returns:
  812. :obj:`int`: The number of tokens that were created in the vocabulary
  813. """
  814. pass
  815. def add_tokens(self, tokens):
  816. """
  817. Add the given tokens to the vocabulary
  818. The given tokens are added only if they don't already exist in the vocabulary.
  819. Each token then gets a new attributed id.
  820. Args:
  821. tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
  822. The list of tokens we want to add to the vocabulary. Each token can be either a
  823. string or an instance of :class:`~tokenizers.AddedToken` for more customization.
  824. Returns:
  825. :obj:`int`: The number of tokens that were created in the vocabulary
  826. """
  827. pass
  828. def async_decode_batch(self, sequences, skip_special_tokens=True):
  829. """
  830. Decode a batch of ids back to their corresponding string
  831. Args:
  832. sequences (:obj:`List` of :obj:`List[int]`):
  833. The batch of sequences we want to decode
  834. skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  835. Whether the special tokens should be removed from the decoded strings
  836. Returns:
  837. :obj:`List[str]`: A list of decoded strings
  838. """
  839. pass
  840. def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
  841. """
  842. Asynchronously encode the given input with character offsets.
  843. This is an async version of encode that can be awaited in async Python code.
  844. Example:
  845. Here are some examples of the inputs that are accepted::
  846. await async_encode("A single sequence")
  847. Args:
  848. sequence (:obj:`~tokenizers.InputSequence`):
  849. The main input sequence we want to encode. This sequence can be either raw
  850. text or pre-tokenized, according to the ``is_pretokenized`` argument:
  851. - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
  852. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
  853. pair (:obj:`~tokenizers.InputSequence`, `optional`):
  854. An optional input sequence. The expected format is the same that for ``sequence``.
  855. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  856. Whether the input is already pre-tokenized
  857. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  858. Whether to add the special tokens
  859. Returns:
  860. :class:`~tokenizers.Encoding`: The encoded result
  861. """
  862. pass
  863. def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
  864. """
  865. Asynchronously encode the given batch of inputs with character offsets.
  866. This is an async version of encode_batch that can be awaited in async Python code.
  867. Example:
  868. Here are some examples of the inputs that are accepted::
  869. await async_encode_batch([
  870. "A single sequence",
  871. ("A tuple with a sequence", "And its pair"),
  872. [ "A", "pre", "tokenized", "sequence" ],
  873. ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
  874. ])
  875. Args:
  876. input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
  877. A list of single sequences or pair sequences to encode. Each sequence
  878. can be either raw text or pre-tokenized, according to the ``is_pretokenized``
  879. argument:
  880. - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
  881. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
  882. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  883. Whether the input is already pre-tokenized
  884. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  885. Whether to add the special tokens
  886. Returns:
  887. A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
  888. """
  889. pass
  890. def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
  891. """
  892. Asynchronously encode the given batch of inputs without tracking character offsets.
  893. This is an async version of encode_batch_fast that can be awaited in async Python code.
  894. Example:
  895. Here are some examples of the inputs that are accepted::
  896. await async_encode_batch_fast([
  897. "A single sequence",
  898. ("A tuple with a sequence", "And its pair"),
  899. [ "A", "pre", "tokenized", "sequence" ],
  900. ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
  901. ])
  902. Args:
  903. input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
  904. A list of single sequences or pair sequences to encode. Each sequence
  905. can be either raw text or pre-tokenized, according to the ``is_pretokenized``
  906. argument:
  907. - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
  908. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
  909. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  910. Whether the input is already pre-tokenized
  911. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  912. Whether to add the special tokens
  913. Returns:
  914. A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
  915. """
  916. pass
  917. def decode(self, ids, skip_special_tokens=True):
  918. """
  919. Decode the given list of ids back to a string
  920. This is used to decode anything coming back from a Language Model
  921. Args:
  922. ids (A :obj:`List/Tuple` of :obj:`int`):
  923. The list of ids that we want to decode
  924. skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  925. Whether the special tokens should be removed from the decoded string
  926. Returns:
  927. :obj:`str`: The decoded string
  928. """
  929. pass
  930. def decode_batch(self, sequences, skip_special_tokens=True):
  931. """
  932. Decode a batch of ids back to their corresponding string
  933. Args:
  934. sequences (:obj:`List` of :obj:`List[int]`):
  935. The batch of sequences we want to decode
  936. skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  937. Whether the special tokens should be removed from the decoded strings
  938. Returns:
  939. :obj:`List[str]`: A list of decoded strings
  940. """
  941. pass
  942. @property
  943. def decoder(self):
  944. """
  945. The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
  946. """
  947. pass
  948. @decoder.setter
  949. def decoder(self, value):
  950. """
  951. The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
  952. """
  953. pass
  954. def enable_padding(
  955. self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
  956. ):
  957. """
  958. Enable the padding
  959. Args:
  960. direction (:obj:`str`, `optional`, defaults to :obj:`right`):
  961. The direction in which to pad. Can be either ``right`` or ``left``
  962. pad_to_multiple_of (:obj:`int`, `optional`):
  963. If specified, the padding length should always snap to the next multiple of the
  964. given value. For example if we were going to pad witha length of 250 but
  965. ``pad_to_multiple_of=8`` then we will pad to 256.
  966. pad_id (:obj:`int`, defaults to 0):
  967. The id to be used when padding
  968. pad_type_id (:obj:`int`, defaults to 0):
  969. The type id to be used when padding
  970. pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
  971. The pad token to be used when padding
  972. length (:obj:`int`, `optional`):
  973. If specified, the length at which to pad. If not specified we pad using the size of
  974. the longest sequence in a batch.
  975. """
  976. pass
  977. def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
  978. """
  979. Enable truncation
  980. Args:
  981. max_length (:obj:`int`):
  982. The max length at which to truncate
  983. stride (:obj:`int`, `optional`):
  984. The length of the previous first sequence to be included in the overflowing
  985. sequence
  986. strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
  987. The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or
  988. ``only_second``.
  989. direction (:obj:`str`, defaults to :obj:`right`):
  990. Truncate direction
  991. """
  992. pass
  993. def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
  994. """
  995. Encode the given sequence and pair. This method can process raw text sequences
  996. as well as already pre-tokenized sequences.
  997. Example:
  998. Here are some examples of the inputs that are accepted::
  999. encode("A single sequence")`
  1000. encode("A sequence", "And its pair")`
  1001. encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
  1002. encode(
  1003. [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
  1004. is_pretokenized=True
  1005. )
  1006. Args:
  1007. sequence (:obj:`~tokenizers.InputSequence`):
  1008. The main input sequence we want to encode. This sequence can be either raw
  1009. text or pre-tokenized, according to the ``is_pretokenized`` argument:
  1010. - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
  1011. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
  1012. pair (:obj:`~tokenizers.InputSequence`, `optional`):
  1013. An optional input sequence. The expected format is the same that for ``sequence``.
  1014. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  1015. Whether the input is already pre-tokenized
  1016. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  1017. Whether to add the special tokens
  1018. Returns:
  1019. :class:`~tokenizers.Encoding`: The encoded result
  1020. """
  1021. pass
  1022. def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
  1023. """
  1024. Encode the given batch of inputs. This method accept both raw text sequences
  1025. as well as already pre-tokenized sequences. The reason we use `PySequence` is
  1026. because it allows type checking with zero-cost (according to PyO3) as we don't
  1027. have to convert to check.
  1028. Example:
  1029. Here are some examples of the inputs that are accepted::
  1030. encode_batch([
  1031. "A single sequence",
  1032. ("A tuple with a sequence", "And its pair"),
  1033. [ "A", "pre", "tokenized", "sequence" ],
  1034. ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
  1035. ])
  1036. Args:
  1037. input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
  1038. A list of single sequences or pair sequences to encode. Each sequence
  1039. can be either raw text or pre-tokenized, according to the ``is_pretokenized``
  1040. argument:
  1041. - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
  1042. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
  1043. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  1044. Whether the input is already pre-tokenized
  1045. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  1046. Whether to add the special tokens
  1047. Returns:
  1048. A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
  1049. """
  1050. pass
  1051. def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
  1052. """
  1053. Encode the given batch of inputs. This method is faster than `encode_batch`
  1054. because it doesn't keep track of offsets, they will be all zeros.
  1055. Example:
  1056. Here are some examples of the inputs that are accepted::
  1057. encode_batch_fast([
  1058. "A single sequence",
  1059. ("A tuple with a sequence", "And its pair"),
  1060. [ "A", "pre", "tokenized", "sequence" ],
  1061. ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
  1062. ])
  1063. Args:
  1064. input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
  1065. A list of single sequences or pair sequences to encode. Each sequence
  1066. can be either raw text or pre-tokenized, according to the ``is_pretokenized``
  1067. argument:
  1068. - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
  1069. - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
  1070. is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
  1071. Whether the input is already pre-tokenized
  1072. add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
  1073. Whether to add the special tokens
  1074. Returns:
  1075. A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
  1076. """
  1077. pass
  1078. @property
  1079. def encode_special_tokens(self):
  1080. """
  1081. Modifies the tokenizer in order to use or not the special tokens
  1082. during encoding.
  1083. Args:
  1084. value (:obj:`bool`):
  1085. Whether to use the special tokens or not
  1086. """
  1087. pass
  1088. @encode_special_tokens.setter
  1089. def encode_special_tokens(self, value):
  1090. """
  1091. Modifies the tokenizer in order to use or not the special tokens
  1092. during encoding.
  1093. Args:
  1094. value (:obj:`bool`):
  1095. Whether to use the special tokens or not
  1096. """
  1097. pass
  1098. @staticmethod
  1099. def from_buffer(buffer):
  1100. """
  1101. Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
  1102. Args:
  1103. buffer (:obj:`bytes`):
  1104. A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
  1105. Returns:
  1106. :class:`~tokenizers.Tokenizer`: The new tokenizer
  1107. """
  1108. pass
  1109. @staticmethod
  1110. def from_file(path):
  1111. """
  1112. Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
  1113. Args:
  1114. path (:obj:`str`):
  1115. A path to a local JSON file representing a previously serialized
  1116. :class:`~tokenizers.Tokenizer`
  1117. Returns:
  1118. :class:`~tokenizers.Tokenizer`: The new tokenizer
  1119. """
  1120. pass
  1121. @staticmethod
  1122. def from_pretrained(identifier, revision="main", token=None):
  1123. """
  1124. Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
  1125. Hugging Face Hub.
  1126. Args:
  1127. identifier (:obj:`str`):
  1128. The identifier of a Model on the Hugging Face Hub, that contains
  1129. a tokenizer.json file
  1130. revision (:obj:`str`, defaults to `main`):
  1131. A branch or commit id
  1132. token (:obj:`str`, `optional`, defaults to `None`):
  1133. An optional auth token used to access private repositories on the
  1134. Hugging Face Hub
  1135. Returns:
  1136. :class:`~tokenizers.Tokenizer`: The new tokenizer
  1137. """
  1138. pass
  1139. @staticmethod
  1140. def from_str(json):
  1141. """
  1142. Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
  1143. Args:
  1144. json (:obj:`str`):
  1145. A valid JSON string representing a previously serialized
  1146. :class:`~tokenizers.Tokenizer`
  1147. Returns:
  1148. :class:`~tokenizers.Tokenizer`: The new tokenizer
  1149. """
  1150. pass
  1151. def get_added_tokens_decoder(self):
  1152. """
  1153. Get the underlying vocabulary
  1154. Returns:
  1155. :obj:`Dict[int, AddedToken]`: The vocabulary
  1156. """
  1157. pass
  1158. def get_vocab(self, with_added_tokens=True):
  1159. """
  1160. Get the underlying vocabulary
  1161. Args:
  1162. with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
  1163. Whether to include the added tokens
  1164. Returns:
  1165. :obj:`Dict[str, int]`: The vocabulary
  1166. """
  1167. pass
  1168. def get_vocab_size(self, with_added_tokens=True):
  1169. """
  1170. Get the size of the underlying vocabulary
  1171. Args:
  1172. with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
  1173. Whether to include the added tokens
  1174. Returns:
  1175. :obj:`int`: The size of the vocabulary
  1176. """
  1177. pass
  1178. def id_to_token(self, id):
  1179. """
  1180. Convert the given id to its corresponding token if it exists
  1181. Args:
  1182. id (:obj:`int`):
  1183. The id to convert
  1184. Returns:
  1185. :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
  1186. """
  1187. pass
  1188. @property
  1189. def model(self):
  1190. """
  1191. The :class:`~tokenizers.models.Model` in use by the Tokenizer
  1192. """
  1193. pass
  1194. @model.setter
  1195. def model(self, value):
  1196. """
  1197. The :class:`~tokenizers.models.Model` in use by the Tokenizer
  1198. """
  1199. pass
  1200. def no_padding(self):
  1201. """
  1202. Disable padding
  1203. """
  1204. pass
  1205. def no_truncation(self):
  1206. """
  1207. Disable truncation
  1208. """
  1209. pass
  1210. @property
  1211. def normalizer(self):
  1212. """
  1213. The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
  1214. """
  1215. pass
  1216. @normalizer.setter
  1217. def normalizer(self, value):
  1218. """
  1219. The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
  1220. """
  1221. pass
  1222. def num_special_tokens_to_add(self, is_pair):
  1223. """
  1224. Return the number of special tokens that would be added for single/pair sentences.
  1225. :param is_pair: Boolean indicating if the input would be a single sentence or a pair
  1226. :return:
  1227. """
  1228. pass
  1229. @property
  1230. def padding(self):
  1231. """
  1232. Get the current padding parameters
  1233. `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
  1234. Returns:
  1235. (:obj:`dict`, `optional`):
  1236. A dict with the current padding parameters if padding is enabled
  1237. """
  1238. pass
  1239. @padding.setter
  1240. def padding(self, value):
  1241. """
  1242. Get the current padding parameters
  1243. `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
  1244. Returns:
  1245. (:obj:`dict`, `optional`):
  1246. A dict with the current padding parameters if padding is enabled
  1247. """
  1248. pass
  1249. def post_process(self, encoding, pair=None, add_special_tokens=True):
  1250. """
  1251. Apply all the post-processing steps to the given encodings.
  1252. The various steps are:
  1253. 1. Truncate according to the set truncation params (provided with
  1254. :meth:`~tokenizers.Tokenizer.enable_truncation`)
  1255. 2. Apply the :class:`~tokenizers.processors.PostProcessor`
  1256. 3. Pad according to the set padding params (provided with
  1257. :meth:`~tokenizers.Tokenizer.enable_padding`)
  1258. Args:
  1259. encoding (:class:`~tokenizers.Encoding`):
  1260. The :class:`~tokenizers.Encoding` corresponding to the main sequence.
  1261. pair (:class:`~tokenizers.Encoding`, `optional`):
  1262. An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
  1263. add_special_tokens (:obj:`bool`):
  1264. Whether to add the special tokens
  1265. Returns:
  1266. :class:`~tokenizers.Encoding`: The final post-processed encoding
  1267. """
  1268. pass
  1269. @property
  1270. def post_processor(self):
  1271. """
  1272. The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
  1273. """
  1274. pass
  1275. @post_processor.setter
  1276. def post_processor(self, value):
  1277. """
  1278. The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
  1279. """
  1280. pass
  1281. @property
  1282. def pre_tokenizer(self):
  1283. """
  1284. The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
  1285. """
  1286. pass
  1287. @pre_tokenizer.setter
  1288. def pre_tokenizer(self, value):
  1289. """
  1290. The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
  1291. """
  1292. pass
  1293. def save(self, path, pretty=True):
  1294. """
  1295. Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
  1296. Args:
  1297. path (:obj:`str`):
  1298. A path to a file in which to save the serialized tokenizer.
  1299. pretty (:obj:`bool`, defaults to :obj:`True`):
  1300. Whether the JSON file should be pretty formatted.
  1301. """
  1302. pass
  1303. def to_str(self, pretty=False):
  1304. """
  1305. Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
  1306. Args:
  1307. pretty (:obj:`bool`, defaults to :obj:`False`):
  1308. Whether the JSON string should be pretty formatted.
  1309. Returns:
  1310. :obj:`str`: A string representing the serialized Tokenizer
  1311. """
  1312. pass
  1313. def token_to_id(self, token):
  1314. """
  1315. Convert the given token to its corresponding id if it exists
  1316. Args:
  1317. token (:obj:`str`):
  1318. The token to convert
  1319. Returns:
  1320. :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
  1321. """
  1322. pass
  1323. def train(self, files, trainer=None):
  1324. """
  1325. Train the Tokenizer using the given files.
  1326. Reads the files line by line, while keeping all the whitespace, even new lines.
  1327. If you want to train from data store in-memory, you can check
  1328. :meth:`~tokenizers.Tokenizer.train_from_iterator`
  1329. Args:
  1330. files (:obj:`List[str]`):
  1331. A list of path to the files that we should use for training
  1332. trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
  1333. An optional trainer that should be used to train our Model
  1334. """
  1335. pass
  1336. def train_from_iterator(self, iterator, trainer=None, length=None):
  1337. """
  1338. Train the Tokenizer using the provided iterator.
  1339. You can provide anything that is a Python Iterator
  1340. * A list of sequences :obj:`List[str]`
  1341. * A generator that yields :obj:`str` or :obj:`List[str]`
  1342. * A Numpy array of strings
  1343. * ...
  1344. Args:
  1345. iterator (:obj:`Iterator`):
  1346. Any iterator over strings or list of strings
  1347. trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
  1348. An optional trainer that should be used to train our Model
  1349. length (:obj:`int`, `optional`):
  1350. The total number of sequences in the iterator. This is used to
  1351. provide meaningful progress tracking
  1352. """
  1353. pass
  1354. @property
  1355. def truncation(self):
  1356. """
  1357. Get the currently set truncation parameters
  1358. `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
  1359. Returns:
  1360. (:obj:`dict`, `optional`):
  1361. A dict with the current truncation parameters if truncation is enabled
  1362. """
  1363. pass
  1364. @truncation.setter
  1365. def truncation(self, value):
  1366. """
  1367. Get the currently set truncation parameters
  1368. `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
  1369. Returns:
  1370. (:obj:`dict`, `optional`):
  1371. A dict with the current truncation parameters if truncation is enabled
  1372. """
  1373. pass
  1374. from enum import Enum
  1375. from typing import List, Tuple, Union, Any
  1376. Offsets = Tuple[int, int]
  1377. TextInputSequence = str
  1378. PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
  1379. TextEncodeInput = Union[
  1380. TextInputSequence,
  1381. Tuple[TextInputSequence, TextInputSequence],
  1382. List[TextInputSequence],
  1383. ]
  1384. PreTokenizedEncodeInput = Union[
  1385. PreTokenizedInputSequence,
  1386. Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
  1387. List[PreTokenizedInputSequence],
  1388. ]
  1389. InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
  1390. EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
  1391. class OffsetReferential(Enum):
  1392. ORIGINAL = "original"
  1393. NORMALIZED = "normalized"
  1394. class OffsetType(Enum):
  1395. BYTE = "byte"
  1396. CHAR = "char"
  1397. class SplitDelimiterBehavior(Enum):
  1398. REMOVED = "removed"
  1399. ISOLATED = "isolated"
  1400. MERGED_WITH_PREVIOUS = "merged_with_previous"
  1401. MERGED_WITH_NEXT = "merged_with_next"
  1402. CONTIGUOUS = "contiguous"
  1403. from .implementations import (
  1404. BertWordPieceTokenizer,
  1405. ByteLevelBPETokenizer,
  1406. CharBPETokenizer,
  1407. SentencePieceBPETokenizer,
  1408. SentencePieceUnigramTokenizer,
  1409. )
# Module-level fallback for type checkers: any attribute that is not declared
# explicitly in this stub resolves to `Any`.
def __getattr__(name: str) -> Any: ...


# NOTE(review): these names are also imported from `.implementations` just above;
# the annotations below re-declare them as `Any`. Confirm which declaration is
# meant to take precedence before reordering or removing either one.
BertWordPieceTokenizer: Any
ByteLevelBPETokenizer: Any
CharBPETokenizer: Any
SentencePieceBPETokenizer: Any
SentencePieceUnigramTokenizer: Any