# __init__.pyi (5.7 KB)
# Generated content DO NOT EDIT
  2. class Trainer:
  3. """
  4. Base class for all trainers
  5. This class is not supposed to be instantiated directly. Instead, any implementation of a
  6. Trainer will return an instance of this class when instantiated.
  7. """
  8. class BpeTrainer(Trainer):
  9. """
  10. Trainer capable of training a BPE model
  11. Args:
  12. vocab_size (:obj:`int`, `optional`):
  13. The size of the final vocabulary, including all tokens and alphabet.
  14. min_frequency (:obj:`int`, `optional`):
  15. The minimum frequency a pair should have in order to be merged.
  16. show_progress (:obj:`bool`, `optional`):
  17. Whether to show progress bars while training.
  18. special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
  19. A list of special tokens the model should know of.
  20. limit_alphabet (:obj:`int`, `optional`):
  21. The maximum different characters to keep in the alphabet.
  22. initial_alphabet (:obj:`List[str]`, `optional`):
  23. A list of characters to include in the initial alphabet, even
  24. if not seen in the training dataset.
  25. If the strings contain more than one character, only the first one
  26. is kept.
  27. continuing_subword_prefix (:obj:`str`, `optional`):
  28. A prefix to be used for every subword that is not a beginning-of-word.
  29. end_of_word_suffix (:obj:`str`, `optional`):
  30. A suffix to be used for every subword that is a end-of-word.
  31. max_token_length (:obj:`int`, `optional`):
  32. Prevents creating tokens longer than the specified size.
  33. This can help with reducing polluting your vocabulary with
  34. highly repetitive tokens like `======` for wikipedia
  35. """
  36. def __init__(
  37. self,
  38. vocab_size=30000,
  39. min_frequency=0,
  40. show_progress=True,
  41. special_tokens=[],
  42. limit_alphabet=None,
  43. initial_alphabet=[],
  44. continuing_subword_prefix=None,
  45. end_of_word_suffix=None,
  46. max_token_length=None,
  47. words={},
  48. ):
  49. pass
  50. class UnigramTrainer(Trainer):
  51. """
  52. Trainer capable of training a Unigram model
  53. Args:
  54. vocab_size (:obj:`int`):
  55. The size of the final vocabulary, including all tokens and alphabet.
  56. show_progress (:obj:`bool`):
  57. Whether to show progress bars while training.
  58. special_tokens (:obj:`List[Union[str, AddedToken]]`):
  59. A list of special tokens the model should know of.
  60. initial_alphabet (:obj:`List[str]`):
  61. A list of characters to include in the initial alphabet, even
  62. if not seen in the training dataset.
  63. If the strings contain more than one character, only the first one
  64. is kept.
  65. shrinking_factor (:obj:`float`):
  66. The shrinking factor used at each step of the training to prune the
  67. vocabulary.
  68. unk_token (:obj:`str`):
  69. The token used for out-of-vocabulary tokens.
  70. max_piece_length (:obj:`int`):
  71. The maximum length of a given token.
  72. n_sub_iterations (:obj:`int`):
  73. The number of iterations of the EM algorithm to perform before
  74. pruning the vocabulary.
  75. """
  76. def __init__(
  77. self,
  78. vocab_size=8000,
  79. show_progress=True,
  80. special_tokens=[],
  81. initial_alphabet=[],
  82. shrinking_factor=0.75,
  83. unk_token=None,
  84. max_piece_length=16,
  85. n_sub_iterations=2,
  86. ):
  87. pass
  88. class WordLevelTrainer(Trainer):
  89. """
  90. Trainer capable of training a WorldLevel model
  91. Args:
  92. vocab_size (:obj:`int`, `optional`):
  93. The size of the final vocabulary, including all tokens and alphabet.
  94. min_frequency (:obj:`int`, `optional`):
  95. The minimum frequency a pair should have in order to be merged.
  96. show_progress (:obj:`bool`, `optional`):
  97. Whether to show progress bars while training.
  98. special_tokens (:obj:`List[Union[str, AddedToken]]`):
  99. A list of special tokens the model should know of.
  100. """
  101. def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
  102. pass
  103. class WordPieceTrainer(Trainer):
  104. """
  105. Trainer capable of training a WordPiece model
  106. Args:
  107. vocab_size (:obj:`int`, `optional`):
  108. The size of the final vocabulary, including all tokens and alphabet.
  109. min_frequency (:obj:`int`, `optional`):
  110. The minimum frequency a pair should have in order to be merged.
  111. show_progress (:obj:`bool`, `optional`):
  112. Whether to show progress bars while training.
  113. special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
  114. A list of special tokens the model should know of.
  115. limit_alphabet (:obj:`int`, `optional`):
  116. The maximum different characters to keep in the alphabet.
  117. initial_alphabet (:obj:`List[str]`, `optional`):
  118. A list of characters to include in the initial alphabet, even
  119. if not seen in the training dataset.
  120. If the strings contain more than one character, only the first one
  121. is kept.
  122. continuing_subword_prefix (:obj:`str`, `optional`):
  123. A prefix to be used for every subword that is not a beginning-of-word.
  124. end_of_word_suffix (:obj:`str`, `optional`):
  125. A suffix to be used for every subword that is a end-of-word.
  126. """
  127. def __init__(
  128. self,
  129. vocab_size=30000,
  130. min_frequency=0,
  131. show_progress=True,
  132. special_tokens=[],
  133. limit_alphabet=None,
  134. initial_alphabet=[],
  135. continuing_subword_prefix="##",
  136. end_of_word_suffix=None,
  137. ):
  138. pass