__init__.pyi 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744
  1. # Generated content DO NOT EDIT
  2. class Model:
  3. """
  4. Base class for all models
  5. The model represents the actual tokenization algorithm. This is the part that
  6. will contain and manage the learned vocabulary.
  7. This class cannot be constructed directly. Please use one of the concrete models.
  8. """
  9. def __init__(self):
  10. pass
  11. def __getstate__(self):
  12. """ """
  13. pass
  14. def __setstate__(self, state):
  15. """ """
  16. pass
  17. def get_trainer(self):
  18. """
  19. Get the associated :class:`~tokenizers.trainers.Trainer`
  20. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  21. :class:`~tokenizers.models.Model`.
  22. Returns:
  23. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
  24. """
  25. pass
  26. def id_to_token(self, id):
  27. """
  28. Get the token associated to an ID
  29. Args:
  30. id (:obj:`int`):
  31. An ID to convert to a token
  32. Returns:
  33. :obj:`str`: The token associated to the ID
  34. """
  35. pass
  36. def save(self, folder, prefix):
  37. """
  38. Save the current model
  39. Save the current model in the given folder, using the given prefix for the various
  40. files that will get created.
  41. Any file with the same name that already exists in this folder will be overwritten.
  42. Args:
  43. folder (:obj:`str`):
  44. The path to the target folder in which to save the various files
  45. prefix (:obj:`str`, `optional`):
  46. An optional prefix, used to prefix each file name
  47. Returns:
  48. :obj:`List[str]`: The list of saved files
  49. """
  50. pass
  51. def token_to_id(self, tokens):
  52. """
  53. Get the ID associated to a token
  54. Args:
  55. token (:obj:`str`):
  56. A token to convert to an ID
  57. Returns:
  58. :obj:`int`: The ID associated to the token
  59. """
  60. pass
  61. def tokenize(self, sequence):
  62. """
  63. Tokenize a sequence
  64. Args:
  65. sequence (:obj:`str`):
  66. A sequence to tokenize
  67. Returns:
  68. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
  69. """
  70. pass
  71. class BPE(Model):
  72. """
  73. An implementation of the BPE (Byte-Pair Encoding) algorithm
  74. Args:
  75. vocab (:obj:`Dict[str, int]`, `optional`):
  76. A dictionary of string keys and their ids :obj:`{"am": 0,...}`
  77. merges (:obj:`List[Tuple[str, str]]`, `optional`):
  78. A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
  79. cache_capacity (:obj:`int`, `optional`):
  80. The number of words that the BPE cache can contain. The cache allows
  81. to speed-up the process by keeping the result of the merge operations
  82. for a number of words.
  83. dropout (:obj:`float`, `optional`):
  84. A float between 0 and 1 that represents the BPE dropout to use.
  85. unk_token (:obj:`str`, `optional`):
  86. The unknown token to be used by the model.
  87. continuing_subword_prefix (:obj:`str`, `optional`):
  88. The prefix to attach to subword units that don't represent a beginning of word.
  89. end_of_word_suffix (:obj:`str`, `optional`):
  90. The suffix to attach to subword units that represent an end of word.
  91. fuse_unk (:obj:`bool`, `optional`):
  92. Whether to fuse any subsequent unknown tokens into a single one
  93. byte_fallback (:obj:`bool`, `optional`):
  94. Whether to use spm byte-fallback trick (defaults to False)
  95. ignore_merges (:obj:`bool`, `optional`):
  96. Whether or not to match tokens with the vocab before using merges.
  97. """
  98. def __init__(
  99. self,
  100. vocab=None,
  101. merges=None,
  102. cache_capacity=None,
  103. dropout=None,
  104. unk_token=None,
  105. continuing_subword_prefix=None,
  106. end_of_word_suffix=None,
  107. fuse_unk=None,
  108. byte_fallback=False,
  109. ignore_merges=False,
  110. ):
  111. pass
  112. def __getstate__(self):
  113. """ """
  114. pass
  115. def __setstate__(self, state):
  116. """ """
  117. pass
  118. @property
  119. def byte_fallback(self):
  120. """ """
  121. pass
  122. @byte_fallback.setter
  123. def byte_fallback(self, value):
  124. """ """
  125. pass
  126. @property
  127. def continuing_subword_prefix(self):
  128. """ """
  129. pass
  130. @continuing_subword_prefix.setter
  131. def continuing_subword_prefix(self, value):
  132. """ """
  133. pass
  134. @property
  135. def dropout(self):
  136. """ """
  137. pass
  138. @dropout.setter
  139. def dropout(self, value):
  140. """ """
  141. pass
  142. @property
  143. def end_of_word_suffix(self):
  144. """ """
  145. pass
  146. @end_of_word_suffix.setter
  147. def end_of_word_suffix(self, value):
  148. """ """
  149. pass
  150. @staticmethod
  151. def from_file(vocab, merges, **kwargs):
  152. """
  153. Instantiate a BPE model from the given files.
  154. This method is roughly equivalent to doing::
  155. vocab, merges = BPE.read_file(vocab_filename, merges_filename)
  156. bpe = BPE(vocab, merges)
  157. If you don't need to keep the :obj:`vocab, merges` values lying around,
  158. this method is more optimized than manually calling
  159. :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
  160. Args:
  161. vocab (:obj:`str`):
  162. The path to a :obj:`vocab.json` file
  163. merges (:obj:`str`):
  164. The path to a :obj:`merges.txt` file
  165. Returns:
  166. :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
  167. """
  168. pass
  169. @property
  170. def fuse_unk(self):
  171. """ """
  172. pass
  173. @fuse_unk.setter
  174. def fuse_unk(self, value):
  175. """ """
  176. pass
  177. def get_trainer(self):
  178. """
  179. Get the associated :class:`~tokenizers.trainers.Trainer`
  180. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  181. :class:`~tokenizers.models.Model`.
  182. Returns:
  183. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
  184. """
  185. pass
  186. def id_to_token(self, id):
  187. """
  188. Get the token associated to an ID
  189. Args:
  190. id (:obj:`int`):
  191. An ID to convert to a token
  192. Returns:
  193. :obj:`str`: The token associated to the ID
  194. """
  195. pass
  196. @property
  197. def ignore_merges(self):
  198. """ """
  199. pass
  200. @ignore_merges.setter
  201. def ignore_merges(self, value):
  202. """ """
  203. pass
  204. @staticmethod
  205. def read_file(vocab, merges):
  206. """
  207. Read a :obj:`vocab.json` and a :obj:`merges.txt` files
  208. This method provides a way to read and parse the content of these files,
  209. returning the relevant data structures. If you want to instantiate some BPE models
  210. from memory, this method gives you the expected input from the standard files.
  211. Args:
  212. vocab (:obj:`str`):
  213. The path to a :obj:`vocab.json` file
  214. merges (:obj:`str`):
  215. The path to a :obj:`merges.txt` file
  216. Returns:
  217. A :obj:`Tuple` with the vocab and the merges:
  218. The vocabulary and merges loaded into memory
  219. """
  220. pass
  221. def save(self, folder, prefix):
  222. """
  223. Save the current model
  224. Save the current model in the given folder, using the given prefix for the various
  225. files that will get created.
  226. Any file with the same name that already exists in this folder will be overwritten.
  227. Args:
  228. folder (:obj:`str`):
  229. The path to the target folder in which to save the various files
  230. prefix (:obj:`str`, `optional`):
  231. An optional prefix, used to prefix each file name
  232. Returns:
  233. :obj:`List[str]`: The list of saved files
  234. """
  235. pass
  236. def token_to_id(self, tokens):
  237. """
  238. Get the ID associated to a token
  239. Args:
  240. token (:obj:`str`):
  241. A token to convert to an ID
  242. Returns:
  243. :obj:`int`: The ID associated to the token
  244. """
  245. pass
  246. def tokenize(self, sequence):
  247. """
  248. Tokenize a sequence
  249. Args:
  250. sequence (:obj:`str`):
  251. A sequence to tokenize
  252. Returns:
  253. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
  254. """
  255. pass
  256. @property
  257. def unk_token(self):
  258. """ """
  259. pass
  260. @unk_token.setter
  261. def unk_token(self, value):
  262. """ """
  263. pass
  264. class Unigram(Model):
  265. """
  266. An implementation of the Unigram algorithm
  267. Args:
  268. vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
  269. A list of vocabulary items and their relative score [("am", -0.2442),...]
  270. """
  271. def __init__(self, vocab=None, unk_id=None, byte_fallback=None):
  272. pass
  273. def __getstate__(self):
  274. """ """
  275. pass
  276. def __setstate__(self, state):
  277. """ """
  278. pass
  279. def get_trainer(self):
  280. """
  281. Get the associated :class:`~tokenizers.trainers.Trainer`
  282. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  283. :class:`~tokenizers.models.Model`.
  284. Returns:
  285. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
  286. """
  287. pass
  288. def id_to_token(self, id):
  289. """
  290. Get the token associated to an ID
  291. Args:
  292. id (:obj:`int`):
  293. An ID to convert to a token
  294. Returns:
  295. :obj:`str`: The token associated to the ID
  296. """
  297. pass
  298. def save(self, folder, prefix):
  299. """
  300. Save the current model
  301. Save the current model in the given folder, using the given prefix for the various
  302. files that will get created.
  303. Any file with the same name that already exists in this folder will be overwritten.
  304. Args:
  305. folder (:obj:`str`):
  306. The path to the target folder in which to save the various files
  307. prefix (:obj:`str`, `optional`):
  308. An optional prefix, used to prefix each file name
  309. Returns:
  310. :obj:`List[str]`: The list of saved files
  311. """
  312. pass
  313. def token_to_id(self, tokens):
  314. """
  315. Get the ID associated to a token
  316. Args:
  317. token (:obj:`str`):
  318. A token to convert to an ID
  319. Returns:
  320. :obj:`int`: The ID associated to the token
  321. """
  322. pass
  323. def tokenize(self, sequence):
  324. """
  325. Tokenize a sequence
  326. Args:
  327. sequence (:obj:`str`):
  328. A sequence to tokenize
  329. Returns:
  330. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
  331. """
  332. pass
  333. class WordLevel(Model):
  334. """
  335. An implementation of the WordLevel algorithm
  336. Most simple tokenizer model based on mapping tokens to their corresponding id.
  337. Args:
  338. vocab (:obj:`str`, `optional`):
  339. A dictionary of string keys and their ids :obj:`{"am": 0,...}`
  340. unk_token (:obj:`str`, `optional`):
  341. The unknown token to be used by the model.
  342. """
  343. def __init__(self, vocab=None, unk_token=None):
  344. pass
  345. def __getstate__(self):
  346. """ """
  347. pass
  348. def __setstate__(self, state):
  349. """ """
  350. pass
  351. @staticmethod
  352. def from_file(vocab, unk_token=None):
  353. """
  354. Instantiate a WordLevel model from the given file
  355. This method is roughly equivalent to doing::
  356. vocab = WordLevel.read_file(vocab_filename)
  357. wordlevel = WordLevel(vocab)
  358. If you don't need to keep the :obj:`vocab` values lying around, this method is
  359. more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
  360. initialize a :class:`~tokenizers.models.WordLevel`
  361. Args:
  362. vocab (:obj:`str`):
  363. The path to a :obj:`vocab.json` file
  364. Returns:
  365. :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
  366. """
  367. pass
  368. def get_trainer(self):
  369. """
  370. Get the associated :class:`~tokenizers.trainers.Trainer`
  371. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  372. :class:`~tokenizers.models.Model`.
  373. Returns:
  374. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
  375. """
  376. pass
  377. def id_to_token(self, id):
  378. """
  379. Get the token associated to an ID
  380. Args:
  381. id (:obj:`int`):
  382. An ID to convert to a token
  383. Returns:
  384. :obj:`str`: The token associated to the ID
  385. """
  386. pass
  387. @staticmethod
  388. def read_file(vocab):
  389. """
  390. Read a :obj:`vocab.json`
  391. This method provides a way to read and parse the content of a vocabulary file,
  392. returning the relevant data structures. If you want to instantiate some WordLevel models
  393. from memory, this method gives you the expected input from the standard files.
  394. Args:
  395. vocab (:obj:`str`):
  396. The path to a :obj:`vocab.json` file
  397. Returns:
  398. :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
  399. """
  400. pass
  401. def save(self, folder, prefix):
  402. """
  403. Save the current model
  404. Save the current model in the given folder, using the given prefix for the various
  405. files that will get created.
  406. Any file with the same name that already exists in this folder will be overwritten.
  407. Args:
  408. folder (:obj:`str`):
  409. The path to the target folder in which to save the various files
  410. prefix (:obj:`str`, `optional`):
  411. An optional prefix, used to prefix each file name
  412. Returns:
  413. :obj:`List[str]`: The list of saved files
  414. """
  415. pass
  416. def token_to_id(self, tokens):
  417. """
  418. Get the ID associated to a token
  419. Args:
  420. token (:obj:`str`):
  421. A token to convert to an ID
  422. Returns:
  423. :obj:`int`: The ID associated to the token
  424. """
  425. pass
  426. def tokenize(self, sequence):
  427. """
  428. Tokenize a sequence
  429. Args:
  430. sequence (:obj:`str`):
  431. A sequence to tokenize
  432. Returns:
  433. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
  434. """
  435. pass
  436. @property
  437. def unk_token(self):
  438. """ """
  439. pass
  440. @unk_token.setter
  441. def unk_token(self, value):
  442. """ """
  443. pass
  444. class WordPiece(Model):
  445. """
  446. An implementation of the WordPiece algorithm
  447. Args:
  448. vocab (:obj:`Dict[str, int]`, `optional`):
  449. A dictionary of string keys and their ids :obj:`{"am": 0,...}`
  450. unk_token (:obj:`str`, `optional`):
  451. The unknown token to be used by the model.
  452. max_input_chars_per_word (:obj:`int`, `optional`):
  453. The maximum number of characters to authorize in a single word.
  454. """
  455. def __init__(self, vocab=None, unk_token="[UNK]", max_input_chars_per_word=100, continuing_subword_prefix="##"):
  456. pass
  457. def __getstate__(self):
  458. """ """
  459. pass
  460. def __setstate__(self, state):
  461. """ """
  462. pass
  463. @property
  464. def continuing_subword_prefix(self):
  465. """ """
  466. pass
  467. @continuing_subword_prefix.setter
  468. def continuing_subword_prefix(self, value):
  469. """ """
  470. pass
  471. @staticmethod
  472. def from_file(vocab, **kwargs):
  473. """
  474. Instantiate a WordPiece model from the given file
  475. This method is roughly equivalent to doing::
  476. vocab = WordPiece.read_file(vocab_filename)
  477. wordpiece = WordPiece(vocab)
  478. If you don't need to keep the :obj:`vocab` values lying around, this method is
  479. more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
  480. initialize a :class:`~tokenizers.models.WordPiece`
  481. Args:
  482. vocab (:obj:`str`):
  483. The path to a :obj:`vocab.txt` file
  484. Returns:
  485. :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
  486. """
  487. pass
  488. def get_trainer(self):
  489. """
  490. Get the associated :class:`~tokenizers.trainers.Trainer`
  491. Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
  492. :class:`~tokenizers.models.Model`.
  493. Returns:
  494. :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
  495. """
  496. pass
  497. def id_to_token(self, id):
  498. """
  499. Get the token associated to an ID
  500. Args:
  501. id (:obj:`int`):
  502. An ID to convert to a token
  503. Returns:
  504. :obj:`str`: The token associated to the ID
  505. """
  506. pass
  507. @property
  508. def max_input_chars_per_word(self):
  509. """ """
  510. pass
  511. @max_input_chars_per_word.setter
  512. def max_input_chars_per_word(self, value):
  513. """ """
  514. pass
  515. @staticmethod
  516. def read_file(vocab):
  517. """
  518. Read a :obj:`vocab.txt` file
  519. This method provides a way to read and parse the content of a standard `vocab.txt`
  520. file as used by the WordPiece Model, returning the relevant data structures. If you
  521. want to instantiate some WordPiece models from memory, this method gives you the
  522. expected input from the standard files.
  523. Args:
  524. vocab (:obj:`str`):
  525. The path to a :obj:`vocab.txt` file
  526. Returns:
  527. :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
  528. """
  529. pass
  530. def save(self, folder, prefix):
  531. """
  532. Save the current model
  533. Save the current model in the given folder, using the given prefix for the various
  534. files that will get created.
  535. Any file with the same name that already exists in this folder will be overwritten.
  536. Args:
  537. folder (:obj:`str`):
  538. The path to the target folder in which to save the various files
  539. prefix (:obj:`str`, `optional`):
  540. An optional prefix, used to prefix each file name
  541. Returns:
  542. :obj:`List[str]`: The list of saved files
  543. """
  544. pass
  545. def token_to_id(self, tokens):
  546. """
  547. Get the ID associated to a token
  548. Args:
  549. token (:obj:`str`):
  550. A token to convert to an ID
  551. Returns:
  552. :obj:`int`: The ID associated to the token
  553. """
  554. pass
  555. def tokenize(self, sequence):
  556. """
  557. Tokenize a sequence
  558. Args:
  559. sequence (:obj:`str`):
  560. A sequence to tokenize
  561. Returns:
  562. A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
  563. """
  564. pass
  565. @property
  566. def unk_token(self):
  567. """ """
  568. pass
  569. @unk_token.setter
  570. def unk_token(self, value):
  571. """ """
  572. pass